[llvm] [SelectionDAG] Fix condition used for unsigned subtraction overflow (PR #170896)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 07:12:50 PST 2025
https://github.com/aabhinavg1 updated https://github.com/llvm/llvm-project/pull/170896
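For context: the PR title targets the comparison SelectionDAG's expandUADDSUBO emits for unsigned-subtraction overflow, and the first patch below experiments with lowering llvm.usub.with.overflow in InstCombine to an explicit subtraction plus an unsigned compare (patch 4 later drops that InstCombine change and adjusts expandUADDSUBO instead). Per the LangRef, the intrinsic returns the wrapped difference together with a flag that is true exactly when the subtraction underflows, i.e. when X u< Y. A minimal scalar model of that contract, as a standalone C++ sketch (the function name is illustrative and not part of the patch):

  #include <cstdint>
  #include <cstdio>
  #include <utility>

  // Scalar model of llvm.usub.with.overflow.i8: the first element is the
  // wrapped difference, the second is true iff the unsigned subtraction
  // underflows, which happens exactly when x < y.
  static std::pair<uint8_t, bool> usub_with_overflow(uint8_t x, uint8_t y) {
    return {static_cast<uint8_t>(x - y), x < y};
  }

  int main() {
    auto [diff, ov] = usub_with_overflow(3, 5);
    std::printf("3 - 5 = %u, overflow = %d\n", unsigned(diff), int(ov)); // 254, 1
  }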
From b268573d6a03fe14e22a7703dcd6a284e5d0ca9a Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Fri, 5 Dec 2025 23:43:14 +0530
Subject: [PATCH 1/8] [InstCombine] Lower to explicit subtraction + unsigned
comparison
---
.../InstCombine/InstCombineCalls.cpp | 13 ++++
.../test/Transforms/InstCombine/known-bits.ll | 15 +++--
llvm/test/Transforms/InstCombine/pr170634.ll | 33 ++++++++++
...ult-of-usub-is-non-zero-and-no-overflow.ll | 60 +++++++++----------
.../usub-overflow-known-by-implied-cond.ll | 40 +++++--------
llvm/test/Transforms/InstCombine/usubo.ll | 10 ++--
.../Transforms/InstCombine/with_overflow.ll | 7 ++-
7 files changed, 108 insertions(+), 70 deletions(-)
create mode 100644 llvm/test/Transforms/InstCombine/pr170634.ll
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 743c4f574e131..af85985843914 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -864,6 +864,19 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
WO->getRHS(), *WO, OperationResult, OverflowResult))
return createOverflowTuple(WO, OperationResult, OverflowResult);
+
+ // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y}
+ if (WO->getBinaryOp() == Instruction::Sub && !WO->isSigned()) {
+ IRBuilder<> Builder(WO);
+ Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
+ Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
+
+ Value *ResultStruct = UndefValue::get(WO->getType());
+ ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
+ ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
+
+ return replaceInstUsesWith(*WO, ResultStruct);
+ }
// See whether we can optimize the overflow check with assumption information.
for (User *U : WO->users()) {
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index da2123a5dfe74..fc73ce5503ffe 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -1068,12 +1068,12 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
; CHECK-LABEL: @extract_value_usub(
; CHECK-NEXT: [[Z:%.*]] = add nuw i8 [[ZZ:%.*]], 1
; CHECK-NEXT: [[Y:%.*]] = add i8 [[X:%.*]], [[Z]]
-; CHECK-NEXT: [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
-; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
-; CHECK-NEXT: [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
+; CHECK-NEXT: [[SUB:%.*]] = xor i8 [[ZZ]], -1
+; CHECK-NEXT: [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
; CHECK-NEXT: call void @use.i1(i1 [[UOV]])
; CHECK-NEXT: call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT: ret i1 false
+; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[ZZ]], -1
+; CHECK-NEXT: ret i1 [[R]]
;
%z = add nuw i8 %zz, 1
%y = add i8 %x, %z
@@ -1090,12 +1090,11 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
define i1 @extract_value_usub_fail(i8 %x, i8 %z) {
; CHECK-LABEL: @extract_value_usub_fail(
; CHECK-NEXT: [[Y:%.*]] = add i8 [[X:%.*]], [[Z:%.*]]
-; CHECK-NEXT: [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
-; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
-; CHECK-NEXT: [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
+; CHECK-NEXT: [[SUB:%.*]] = sub i8 0, [[Z]]
+; CHECK-NEXT: [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
; CHECK-NEXT: call void @use.i1(i1 [[UOV]])
; CHECK-NEXT: call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[SUB]], 0
+; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[Z]], 0
; CHECK-NEXT: ret i1 [[R]]
;
%y = add i8 %x, %z
diff --git a/llvm/test/Transforms/InstCombine/pr170634.ll b/llvm/test/Transforms/InstCombine/pr170634.ll
new file mode 100644
index 0000000000000..62a332e14b04a
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/pr170634.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+define dso_local i64 @func(i64 noundef %x, i64 noundef %y) local_unnamed_addr {
+; CHECK-LABEL: @func(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: br label [[RETURN:%.*]]
+; CHECK: if.end:
+; CHECK-NEXT: [[TMP1:%.*]] = sub nuw i64 [[X]], [[Y]]
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: return:
+; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i64 [ 291, [[IF_THEN]] ], [ [[TMP1]], [[IF_END]] ]
+; CHECK-NEXT: ret i64 [[RETVAL_0]]
+;
+entry:
+ %0 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %x, i64 %y)
+ %1 = extractvalue { i64, i1 } %0, 1
+ %2 = extractvalue { i64, i1 } %0, 0
+ br i1 %1, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ br label %return
+
+if.end: ; preds = %entry
+ br label %return
+
+return: ; preds = %if.end, %if.then
+ %retval.0 = phi i64 [ 291, %if.then ], [ %2, %if.end ]
+ ret i64 %retval.0
+}
+
diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
index 30a5072c7edc8..46b8a853e6cf5 100644
--- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
@@ -141,16 +141,16 @@ define i1 @t1_strict_logical(i8 %base, i8 %offset) {
define i1 @t2(i8 %base, i8 %offset) {
; CHECK-LABEL: @t2(
-; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -168,16 +168,16 @@ define i1 @t2(i8 %base, i8 %offset) {
define i1 @t2_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t2_logical(
-; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -321,16 +321,16 @@ define i1 @t5_commutability2_logical(i8 %base, i8 %offset) {
define i1 @t6_commutability(i8 %base, i8 %offset) {
; CHECK-LABEL: @t6_commutability(
-; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -348,16 +348,16 @@ define i1 @t6_commutability(i8 %base, i8 %offset) {
define i1 @t6_commutability_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t6_commutability_logical(
-; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
+; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -459,14 +459,14 @@ define i1 @t7_nonstrict_logical(i8 %base, i8 %offset) {
define i1 @t8(i8 %base, i8 %offset) {
; CHECK-LABEL: @t8(
-; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT: [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
+; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -482,14 +482,14 @@ define i1 @t8(i8 %base, i8 %offset) {
define i1 @t8_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t8_logical(
-; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
-; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT: [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
+; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
diff --git a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
index 90ca39a70a0bb..c9030e5ab0321 100644
--- a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
+++ b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
@@ -175,11 +175,10 @@ define i32 @test7(i32 %a, i32 %b) {
; CHECK-NEXT: [[COND:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: br i1 [[COND]], label [[BB1:%.*]], label [[BB3:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -205,11 +204,10 @@ define i32 @test8(i32 %a, i32 %b) {
; CHECK-NEXT: [[COND_NOT:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: br i1 [[COND_NOT]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -296,11 +294,10 @@ define i32 @test10(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[AND:%.*]] = and i1 [[COND]], [[COND2:%.*]]
; CHECK-NEXT: br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -328,11 +325,10 @@ define i32 @test10_logical(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[AND:%.*]] = select i1 [[COND]], i1 [[COND2:%.*]], i1 false
; CHECK-NEXT: br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -360,11 +356,10 @@ define i32 @test11(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -392,11 +387,10 @@ define i32 @test11_logical(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -424,11 +418,10 @@ define i32 @test12(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -456,11 +449,10 @@ define i32 @test12_logical(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
-; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
+; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
+; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/InstCombine/usubo.ll b/llvm/test/Transforms/InstCombine/usubo.ll
index 2074190a2cd45..e4b9c0e08ba22 100644
--- a/llvm/test/Transforms/InstCombine/usubo.ll
+++ b/llvm/test/Transforms/InstCombine/usubo.ll
@@ -130,10 +130,9 @@ define i1 @sub_ne0(i8 %x, i8 %y, i1 %b) {
define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
; CHECK-LABEL: @sub_eq1(
-; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
-; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
+; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
; CHECK-NEXT: call void @use(i1 [[OV]])
-; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
; CHECK-NEXT: [[EQ1:%.*]] = icmp eq i8 [[SUB]], 1
; CHECK-NEXT: ret i1 [[EQ1]]
;
@@ -149,10 +148,9 @@ define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
define i1 @sub_sgt0(i8 %x, i8 %y, i1 %b) {
; CHECK-LABEL: @sub_sgt0(
-; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
-; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
+; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
; CHECK-NEXT: call void @use(i1 [[OV]])
-; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
; CHECK-NEXT: [[SGT0:%.*]] = icmp sgt i8 [[SUB]], 0
; CHECK-NEXT: ret i1 [[SGT0]]
;
diff --git a/llvm/test/Transforms/InstCombine/with_overflow.ll b/llvm/test/Transforms/InstCombine/with_overflow.ll
index fa810408730e1..4f7a15cc89d6c 100644
--- a/llvm/test/Transforms/InstCombine/with_overflow.ll
+++ b/llvm/test/Transforms/InstCombine/with_overflow.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+; RUN: opt -passes='instcombine<no-verify-fixpoint>' -S < %s | FileCheck %s
declare { i8, i1 } @llvm.uadd.with.overflow.i8(i8, i8) nounwind readnone
declare { i8, i1 } @llvm.sadd.with.overflow.i8(i8, i8) nounwind readnone
@@ -506,7 +506,10 @@ define { i32, i1 } @ssub_no_canonicalize_constant_arg0(i32 %x) nounwind {
define { i32, i1 } @usub_no_canonicalize_constant_arg0(i32 %x) nounwind {
; CHECK-LABEL: @usub_no_canonicalize_constant_arg0(
-; CHECK-NEXT: [[A:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 [[X:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 42, [[X]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i32, i1 } undef, i32 [[TMP1]], 0
+; CHECK-NEXT: [[A:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP2]], 1
; CHECK-NEXT: ret { i32, i1 } [[A]]
;
%a = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 %x)
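A quick cross-check of the FileCheck rewrites in result-of-usub-is-non-zero-and-no-overflow.ll above: once the intrinsic is split into a sub and an icmp ult, "result is non-zero and no underflow" folds to a single icmp ugt, and "result is zero or underflow" folds to icmp ule. A brute-force verification of those identities at 8 bits (a standalone sketch, not part of the patch):

  #include <cassert>
  #include <cstdint>

  // Exhaustively confirm the identities behind the updated CHECK lines:
  //   (x - y != 0 && !(x < y))  ==  (x >  y)   (the t2/t6 patterns)
  //   (x - y == 0 ||  (x < y))  ==  (x <= y)   (the t8 patterns)
  int main() {
    for (unsigned x = 0; x < 256; ++x) {
      for (unsigned y = 0; y < 256; ++y) {
        uint8_t diff = static_cast<uint8_t>(x - y);
        bool underflow = x < y;
        assert((diff != 0 && !underflow) == (x > y));
        assert((diff == 0 || underflow) == (x <= y));
      }
    }
    return 0;
  }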
From e3fdf8dd13a1a8c3fc3ea7dd1916762d95276570 Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Fri, 5 Dec 2025 23:46:22 +0530
Subject: [PATCH 2/8] Formatted with git clang-format HEAD~1
---
llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index af85985843914..3bd7eb855b147 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -864,17 +864,17 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
WO->getRHS(), *WO, OperationResult, OverflowResult))
return createOverflowTuple(WO, OperationResult, OverflowResult);
-
- // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y}
+
+ // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y}
if (WO->getBinaryOp() == Instruction::Sub && !WO->isSigned()) {
IRBuilder<> Builder(WO);
Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
-
+
Value *ResultStruct = UndefValue::get(WO->getType());
ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
-
+
return replaceInstUsesWith(*WO, ResultStruct);
}
From aeef41f725b96ec57f72c2eb9788735419ae7172 Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Sat, 6 Dec 2025 00:27:48 +0530
Subject: [PATCH 3/8] fix formatting and replace undef with poison
---
llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 2 +-
.../result-of-usub-is-non-zero-and-no-overflow.ll | 12 ++++++------
llvm/test/Transforms/InstCombine/with_overflow.ll | 2 +-
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 3bd7eb855b147..d0b71f12c3159 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -871,7 +871,7 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
- Value *ResultStruct = UndefValue::get(WO->getType());
+ Value *ResultStruct = PoisonValue::get(WO->getType());
ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
index 46b8a853e6cf5..f8b318bc3680a 100644
--- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
@@ -143,7 +143,7 @@ define i1 @t2(i8 %base, i8 %offset) {
; CHECK-LABEL: @t2(
; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
@@ -170,7 +170,7 @@ define i1 @t2_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t2_logical(
; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
@@ -323,7 +323,7 @@ define i1 @t6_commutability(i8 %base, i8 %offset) {
; CHECK-LABEL: @t6_commutability(
; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
@@ -350,7 +350,7 @@ define i1 @t6_commutability_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t6_commutability_logical(
; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
@@ -461,7 +461,7 @@ define i1 @t8(i8 %base, i8 %offset) {
; CHECK-LABEL: @t8(
; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
@@ -484,7 +484,7 @@ define i1 @t8_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t8_logical(
; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } undef, i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
diff --git a/llvm/test/Transforms/InstCombine/with_overflow.ll b/llvm/test/Transforms/InstCombine/with_overflow.ll
index 4f7a15cc89d6c..0c82bdc256ddf 100644
--- a/llvm/test/Transforms/InstCombine/with_overflow.ll
+++ b/llvm/test/Transforms/InstCombine/with_overflow.ll
@@ -508,7 +508,7 @@ define { i32, i1 } @usub_no_canonicalize_constant_arg0(i32 %x) nounwind {
; CHECK-LABEL: @usub_no_canonicalize_constant_arg0(
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 42, [[X]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i32, i1 } undef, i32 [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i32, i1 } poison, i32 [[TMP1]], 0
; CHECK-NEXT: [[A:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP2]], 1
; CHECK-NEXT: ret { i32, i1 } [[A]]
;
From 87d56d3d369db1fef1789ccbc3f7890e30daa96a Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Sat, 6 Dec 2025 23:16:05 +0530
Subject: [PATCH 4/8] Address review feedback
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 4 +-
.../InstCombine/InstCombineCalls.cpp | 13 -
.../test/CodeGen/RISCV/arith-with-overflow.ll | 7 +-
.../CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll | 24 +-
llvm/test/CodeGen/RISCV/rvv/abs-vp.ll | 4 +-
llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll | 44 +-
llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll | 16 +-
llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll | 46 +-
llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll | 68 +-
.../CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-bitreverse-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-bswap-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-ceil-vp.ll | 64 +-
.../RISCV/rvv/fixed-vectors-ctlz-vp.ll | 432 ++-
.../RISCV/rvv/fixed-vectors-ctpop-vp.ll | 76 +-
.../RISCV/rvv/fixed-vectors-cttz-vp.ll | 244 +-
.../RISCV/rvv/fixed-vectors-floor-vp.ll | 64 +-
.../RISCV/rvv/fixed-vectors-fmaximum-vp.ll | 77 +-
.../RISCV/rvv/fixed-vectors-fminimum-vp.ll | 77 +-
.../RISCV/rvv/fixed-vectors-fpext-vp.ll | 8 +-
.../RISCV/rvv/fixed-vectors-fptosi-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-fptoui-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-fptrunc-vp.ll | 8 +-
.../RISCV/rvv/fixed-vectors-nearbyint-vp.ll | 32 +-
.../rvv/fixed-vectors-reduction-fp-vp.ll | 16 +-
.../rvv/fixed-vectors-reduction-int-vp.ll | 8 +-
.../rvv/fixed-vectors-reduction-mask-vp.ll | 8 +-
.../RISCV/rvv/fixed-vectors-rint-vp.ll | 32 +-
.../RISCV/rvv/fixed-vectors-round-vp.ll | 64 +-
.../RISCV/rvv/fixed-vectors-roundeven-vp.ll | 64 +-
.../RISCV/rvv/fixed-vectors-roundtozero-vp.ll | 64 +-
.../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-setcc-int-vp.ll | 50 +-
.../RISCV/rvv/fixed-vectors-sext-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-sitofp-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-strided-vpload.ll | 74 +-
.../rvv/fixed-vectors-strided-vpstore.ll | 18 +-
.../RISCV/rvv/fixed-vectors-trunc-vp.ll | 299 +-
.../RISCV/rvv/fixed-vectors-uitofp-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vadd-vp.ll | 32 +-
.../RISCV/rvv/fixed-vectors-vcopysign-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vfabs-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vfma-vp.ll | 48 +-
.../RISCV/rvv/fixed-vectors-vfmax-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vfmin-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vfmuladd-vp.ll | 48 +-
.../RISCV/rvv/fixed-vectors-vfneg-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vfsqrt-vp.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vmax-vp.ll | 24 +-
.../RISCV/rvv/fixed-vectors-vmaxu-vp.ll | 24 +-
.../RISCV/rvv/fixed-vectors-vmin-vp.ll | 24 +-
.../RISCV/rvv/fixed-vectors-vminu-vp.ll | 24 +-
.../RISCV/rvv/fixed-vectors-vpgather.ll | 184 +-
.../CodeGen/RISCV/rvv/fixed-vectors-vpload.ll | 24 +-
.../RISCV/rvv/fixed-vectors-vpmerge.ll | 16 +-
.../RISCV/rvv/fixed-vectors-vpscatter.ll | 64 +-
.../RISCV/rvv/fixed-vectors-vpstore.ll | 8 +-
.../RISCV/rvv/fixed-vectors-vsadd-vp.ll | 32 +-
.../RISCV/rvv/fixed-vectors-vsaddu-vp.ll | 32 +-
.../RISCV/rvv/fixed-vectors-vselect-vp.ll | 28 +-
.../RISCV/rvv/fixed-vectors-vssub-vp.ll | 32 +-
.../RISCV/rvv/fixed-vectors-vssubu-vp.ll | 32 +-
.../RISCV/rvv/fixed-vectors-zext-vp.ll | 16 +-
llvm/test/CodeGen/RISCV/rvv/floor-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll | 68 +-
llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll | 68 +-
llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll | 4 +-
llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll | 2 +-
llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll | 2 +-
llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll | 52 +-
.../RISCV/rvv/nontemporal-vp-scalable.ll | 3010 ++++++++---------
llvm/test/CodeGen/RISCV/rvv/rint-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/round-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll | 869 +++--
llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll | 297 +-
llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll | 50 +-
.../test/CodeGen/RISCV/rvv/strided-vpstore.ll | 82 +-
llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll | 18 +-
llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll | 4 +-
llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll | 531 +--
llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll | 276 +-
llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll | 2956 ++++------------
llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll | 52 +-
llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll | 139 +-
llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll | 6 +-
llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll | 4 +-
llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll | 2 +-
llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll | 6 +-
llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll | 6 +-
llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll | 12 +-
llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll | 276 +-
llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll | 18 +-
llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll | 18 +-
llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll | 18 +-
llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll | 18 +-
llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll | 16 +-
llvm/test/CodeGen/RISCV/rvv/vp-splat.ll | 2 +-
llvm/test/CodeGen/RISCV/rvv/vp-splice.ll | 140 +-
.../test/CodeGen/RISCV/rvv/vpgather-sdnode.ll | 28 +-
llvm/test/CodeGen/RISCV/rvv/vpload.ll | 34 +-
llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll | 18 +-
.../CodeGen/RISCV/rvv/vpscatter-sdnode.ll | 16 +-
llvm/test/CodeGen/RISCV/rvv/vpstore.ll | 20 +-
.../CodeGen/RISCV/rvv/vreductions-fp-vp.ll | 4 +-
.../CodeGen/RISCV/rvv/vreductions-int-vp.ll | 2 +-
.../CodeGen/RISCV/rvv/vreductions-mask-vp.ll | 2 +-
llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll | 30 +-
llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll | 4 +-
llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll | 12 +-
llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll | 8 +-
llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll | 4 +-
llvm/test/CodeGen/RISCV/usub_sat.ll | 48 +-
llvm/test/CodeGen/RISCV/usub_sat_plus.ll | 44 +-
llvm/test/CodeGen/RISCV/xaluo.ll | 129 +-
llvm/test/CodeGen/RISCV/xqcia.ll | 6 +-
.../test/Transforms/InstCombine/known-bits.ll | 15 +-
llvm/test/Transforms/InstCombine/pr170634.ll | 5 +-
...ult-of-usub-is-non-zero-and-no-overflow.ll | 60 +-
.../usub-overflow-known-by-implied-cond.ll | 40 +-
llvm/test/Transforms/InstCombine/usubo.ll | 10 +-
.../Transforms/InstCombine/with_overflow.ll | 5 +-
132 files changed, 5135 insertions(+), 7744 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 172c7485e108b..8b46c4c1e66db 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11466,7 +11466,9 @@ void TargetLowering::expandUADDSUBO(
DAG.getConstant(0, dl, Node->getValueType(0)), ISD::SETNE);
} else {
ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT;
- SetCC = DAG.getSetCC(dl, SetCCType, Result, LHS, CC);
+ SDValue CompareLHS = IsAdd ? Result : LHS;
+ SDValue CompareRHS = IsAdd ? LHS : RHS;
+ SetCC = DAG.getSetCC(dl, SetCCType, CompareLHS, CompareRHS, CC);
}
Overflow = DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType);
}
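On the expandUADDSUBO hunk just above: in the branch shown, overflow was previously derived from the result, with SETULT for addition and SETUGT for subtraction, i.e. uadd overflows iff Result u< LHS and usub overflows iff Result u> LHS; an equivalent operand-only form of the subtraction condition is LHS u< RHS. Those scalar identities, checked exhaustively at 8 bits so they can be compared against the operands the new code passes to getSetCC (a standalone sketch, not the DAG code itself):

  #include <cassert>
  #include <cstdint>

  // Identities behind the overflow checks expandUADDSUBO can emit, verified
  // exhaustively at 8 bits:
  //   uadd: carry out  <=>  (x + y) u< x
  //   usub: borrow     <=>  (x - y) u> x   <=>  x u< y
  int main() {
    for (unsigned x = 0; x < 256; ++x) {
      for (unsigned y = 0; y < 256; ++y) {
        uint8_t add = static_cast<uint8_t>(x + y);
        uint8_t sub = static_cast<uint8_t>(x - y);
        assert((x + y > 255) == (add < x)); // true carry vs. result-based test
        assert((x < y) == (sub > x));       // true borrow vs. result-based test
      }
    }
    return 0;
  }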
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d0b71f12c3159..743c4f574e131 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -865,19 +865,6 @@ InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
WO->getRHS(), *WO, OperationResult, OverflowResult))
return createOverflowTuple(WO, OperationResult, OverflowResult);
- // Transform: usub.with.overflow(X, Y) -> {X - Y, X u< Y}
- if (WO->getBinaryOp() == Instruction::Sub && !WO->isSigned()) {
- IRBuilder<> Builder(WO);
- Value *Sub = Builder.CreateSub(WO->getLHS(), WO->getRHS());
- Value *Overflow = Builder.CreateICmpULT(WO->getLHS(), WO->getRHS());
-
- Value *ResultStruct = PoisonValue::get(WO->getType());
- ResultStruct = Builder.CreateInsertValue(ResultStruct, Sub, 0);
- ResultStruct = Builder.CreateInsertValue(ResultStruct, Overflow, 1);
-
- return replaceInstUsesWith(*WO, ResultStruct);
- }
-
// See whether we can optimize the overflow check with assumption information.
for (User *U : WO->users()) {
if (!match(U, m_ExtractValue<1>(m_Value())))
diff --git a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
index 557b4b7c2afa2..84526a1fca0f9 100644
--- a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
+++ b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
@@ -54,9 +54,10 @@ entry:
define i1 @usub(i32 %a, i32 %b, ptr %c) nounwind {
; RV32I-LABEL: usub:
; RV32I: # %bb.0: # %entry
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
-; RV32I-NEXT: sw a1, 0(a2)
+; RV32I-NEXT: sltu a3, a1, a0
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: mv a0, a3
; RV32I-NEXT: ret
entry:
%x = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
index ea9786d0b10b3..f5f122a8c9dd7 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
@@ -715,7 +715,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
; RV32I-NEXT: zext.b a0, a3
; RV32I-NEXT: sub a1, a0, s1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sltu a0, s1, a0
; RV32I-NEXT: addi a0, a0, -1
; RV32I-NEXT: and a2, a0, a1
; RV32I-NEXT: sb a3, 3(sp)
@@ -755,7 +755,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; RV32IA-NEXT: srl a4, a4, a0
; RV32IA-NEXT: zext.b a4, a4
; RV32IA-NEXT: sub a6, a4, a1
-; RV32IA-NEXT: sltu a4, a4, a6
+; RV32IA-NEXT: sltu a4, a1, a4
; RV32IA-NEXT: addi a4, a4, -1
; RV32IA-NEXT: and a4, a4, a6
; RV32IA-NEXT: sll a4, a4, a0
@@ -792,7 +792,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: zext.b a0, a3
; RV64I-NEXT: sub a1, a0, s1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sltu a0, s1, a0
; RV64I-NEXT: addi a0, a0, -1
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: sb a3, 7(sp)
@@ -832,7 +832,7 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; RV64IA-NEXT: sext.w a6, a3
; RV64IA-NEXT: zext.b a5, a5
; RV64IA-NEXT: sub a7, a5, a1
-; RV64IA-NEXT: sltu a5, a5, a7
+; RV64IA-NEXT: sltu a5, a1, a5
; RV64IA-NEXT: addi a5, a5, -1
; RV64IA-NEXT: and a5, a5, a7
; RV64IA-NEXT: sllw a5, a5, a0
@@ -877,7 +877,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
; RV32I-NEXT: and a0, a3, s1
; RV32I-NEXT: sub a1, a0, s2
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sltu a0, s2, a0
; RV32I-NEXT: addi a0, a0, -1
; RV32I-NEXT: and a2, a0, a1
; RV32I-NEXT: sh a3, 14(sp)
@@ -920,7 +920,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; RV32IA-NEXT: srl a5, a5, a0
; RV32IA-NEXT: and a5, a5, a3
; RV32IA-NEXT: sub a7, a5, a1
-; RV32IA-NEXT: sltu a5, a5, a7
+; RV32IA-NEXT: sltu a5, a1, a5
; RV32IA-NEXT: addi a5, a5, -1
; RV32IA-NEXT: and a5, a5, a7
; RV32IA-NEXT: sll a5, a5, a0
@@ -961,7 +961,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: and a0, a3, s1
; RV64I-NEXT: sub a1, a0, s2
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sltu a0, s2, a0
; RV64I-NEXT: addi a0, a0, -1
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: sh a3, 14(sp)
@@ -1004,7 +1004,7 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; RV64IA-NEXT: sext.w a7, a4
; RV64IA-NEXT: and a6, a6, a3
; RV64IA-NEXT: sub t0, a6, a1
-; RV64IA-NEXT: sltu a6, a6, t0
+; RV64IA-NEXT: sltu a6, a1, a6
; RV64IA-NEXT: addi a6, a6, -1
; RV64IA-NEXT: and a6, a6, t0
; RV64IA-NEXT: sllw a6, a6, a0
@@ -1044,7 +1044,7 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
; RV32I-NEXT: .LBB6_1: # %atomicrmw.start
; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
; RV32I-NEXT: sub a0, a3, s1
-; RV32I-NEXT: sltu a1, a3, a0
+; RV32I-NEXT: sltu a1, s1, a3
; RV32I-NEXT: addi a1, a1, -1
; RV32I-NEXT: and a2, a1, a0
; RV32I-NEXT: sw a3, 0(sp)
@@ -1075,7 +1075,7 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
; RV32IA-NEXT: # Child Loop BB6_3 Depth 2
; RV32IA-NEXT: mv a3, a2
; RV32IA-NEXT: sub a2, a2, a1
-; RV32IA-NEXT: sltu a4, a3, a2
+; RV32IA-NEXT: sltu a4, a1, a3
; RV32IA-NEXT: addi a4, a4, -1
; RV32IA-NEXT: and a4, a4, a2
; RV32IA-NEXT: .LBB6_3: # %atomicrmw.start
@@ -1298,7 +1298,7 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
; RV64I-NEXT: .LBB7_1: # %atomicrmw.start
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: sub a0, a3, s1
-; RV64I-NEXT: sltu a1, a3, a0
+; RV64I-NEXT: sltu a1, s1, a3
; RV64I-NEXT: addi a1, a1, -1
; RV64I-NEXT: and a2, a1, a0
; RV64I-NEXT: sd a3, 0(sp)
@@ -1329,7 +1329,7 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
; RV64IA-NEXT: # Child Loop BB7_3 Depth 2
; RV64IA-NEXT: mv a3, a2
; RV64IA-NEXT: sub a2, a2, a1
-; RV64IA-NEXT: sltu a4, a3, a2
+; RV64IA-NEXT: sltu a4, a1, a3
; RV64IA-NEXT: addi a4, a4, -1
; RV64IA-NEXT: and a4, a4, a2
; RV64IA-NEXT: .LBB7_3: # %atomicrmw.start
diff --git a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
index 5b215c5173211..0fb4b2a06b76f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
@@ -519,7 +519,7 @@ define <vscale x 16 x i64> @vp_abs_nxv16i64(<vscale x 16 x i64> %va, <vscale x 1
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -543,7 +543,7 @@ define <vscale x 16 x i64> @vp_abs_nxv16i64_unmasked(<vscale x 16 x i64> %va, i3
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 09b8fdbf11d26..025f944bcd51c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -3018,7 +3018,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
; CHECK-NEXT: slli a3, a3, 2
; CHECK-NEXT: vslidedown.vx v0, v0, a4
; CHECK-NEXT: sub a4, a0, a3
-; CHECK-NEXT: sltu a5, a0, a4
+; CHECK-NEXT: sltu a5, a3, a0
; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: and a5, a5, a4
; CHECK-NEXT: lui a6, 5
@@ -3079,7 +3079,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
; CHECK-ZVBB-NEXT: slli a1, a1, 2
; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e16, m8, ta, ma
@@ -3104,7 +3104,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16_unmasked(<vscale x 64 x i16>
; CHECK-NEXT: lui a2, 3
; CHECK-NEXT: slli a3, a3, 2
; CHECK-NEXT: sub a4, a0, a3
-; CHECK-NEXT: sltu a5, a0, a4
+; CHECK-NEXT: sltu a5, a3, a0
; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: and a5, a5, a4
; CHECK-NEXT: lui a6, 5
@@ -3160,7 +3160,7 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16_unmasked(<vscale x 64 x i16>
; CHECK-ZVBB-NEXT: csrr a1, vlenb
; CHECK-ZVBB-NEXT: slli a1, a1, 2
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e16, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 0177b8cfd4393..668a770610f20 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -1534,7 +1534,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
; CHECK-NEXT: slli a1, a1, 2
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
@@ -1561,7 +1561,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
; CHECK-ZVKB-NEXT: slli a1, a1, 2
; CHECK-ZVKB-NEXT: vslidedown.vx v0, v0, a2
; CHECK-ZVKB-NEXT: sub a2, a0, a1
-; CHECK-ZVKB-NEXT: sltu a3, a0, a2
+; CHECK-ZVKB-NEXT: sltu a3, a1, a0
; CHECK-ZVKB-NEXT: addi a3, a3, -1
; CHECK-ZVKB-NEXT: and a2, a3, a2
; CHECK-ZVKB-NEXT: vsetvli zero, a2, e16, m8, ta, ma
@@ -1584,7 +1584,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16_unmasked(<vscale x 64 x i16> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
@@ -1606,7 +1606,7 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16_unmasked(<vscale x 64 x i16> %va,
; CHECK-ZVKB-NEXT: csrr a1, vlenb
; CHECK-ZVKB-NEXT: slli a1, a1, 2
; CHECK-ZVKB-NEXT: sub a2, a0, a1
-; CHECK-ZVKB-NEXT: sltu a3, a0, a2
+; CHECK-ZVKB-NEXT: sltu a3, a1, a0
; CHECK-ZVKB-NEXT: addi a3, a3, -1
; CHECK-ZVKB-NEXT: and a2, a3, a2
; CHECK-ZVKB-NEXT: vsetvli zero, a2, e16, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
index 6c7709f52e30b..d3813b703c5be 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(<vscale x 32 x bflo
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1585,7 +1585,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
; RV32ZFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZFH-NEXT: sub a2, a0, a1
; RV32ZFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZFH-NEXT: sltu a3, a0, a2
+; RV32ZFH-NEXT: sltu a3, a1, a0
; RV32ZFH-NEXT: addi a3, a3, -1
; RV32ZFH-NEXT: and a2, a3, a2
; RV32ZFH-NEXT: vmv1r.v v0, v6
@@ -1631,7 +1631,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
; RV64ZFH-NEXT: sub a3, a0, a1
; RV64ZFH-NEXT: slli a2, a2, 52
; RV64ZFH-NEXT: fmv.d.x fa5, a2
-; RV64ZFH-NEXT: sltu a2, a0, a3
+; RV64ZFH-NEXT: sltu a2, a1, a0
; RV64ZFH-NEXT: addi a2, a2, -1
; RV64ZFH-NEXT: and a2, a2, a3
; RV64ZFH-NEXT: vmv1r.v v0, v6
@@ -1676,7 +1676,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
; RV32ZFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZFH-NEXT: sub a3, a0, a1
; RV32ZFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZFH-NEXT: sltu a2, a0, a3
+; RV32ZFH-NEXT: sltu a2, a1, a0
; RV32ZFH-NEXT: addi a2, a2, -1
; RV32ZFH-NEXT: and a2, a2, a3
; RV32ZFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -1710,7 +1710,7 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
; RV64ZFH-NEXT: sub a3, a0, a1
; RV64ZFH-NEXT: slli a2, a2, 52
; RV64ZFH-NEXT: fmv.d.x fa5, a2
-; RV64ZFH-NEXT: sltu a2, a0, a3
+; RV64ZFH-NEXT: sltu a2, a1, a0
; RV64ZFH-NEXT: addi a2, a2, -1
; RV64ZFH-NEXT: and a2, a2, a3
; RV64ZFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 20f397b694180..f8293f6c671f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -1195,7 +1195,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; CHECK-NEXT: srli a3, a1, 3
; CHECK-NEXT: sub a5, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a3
-; CHECK-NEXT: sltu a3, a0, a5
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a5, a3, a5
; CHECK-NEXT: li a3, 1086
@@ -1228,7 +1228,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; CHECK-ZVBB-NEXT: srli a2, a1, 3
; CHECK-ZVBB-NEXT: sub a3, a0, a1
; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT: sltu a2, a0, a3
+; CHECK-ZVBB-NEXT: sltu a2, a1, a0
; CHECK-ZVBB-NEXT: addi a2, a2, -1
; CHECK-ZVBB-NEXT: and a2, a2, a3
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -1252,7 +1252,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
; CHECK-NEXT: fsrmi a4, 1
; CHECK-NEXT: li a2, 52
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: sltu a5, a0, a3
+; CHECK-NEXT: sltu a5, a1, a0
; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: and a5, a5, a3
; CHECK-NEXT: li a3, 1086
@@ -1280,7 +1280,7 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: csrr a1, vlenb
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2402,7 +2402,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a4, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a4
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a4, a2, a4
; CHECK-NEXT: li a2, 52
@@ -2433,7 +2433,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
; CHECK-ZVBB-NEXT: srli a2, a1, 3
; CHECK-ZVBB-NEXT: sub a3, a0, a1
; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT: sltu a2, a0, a3
+; CHECK-ZVBB-NEXT: sltu a2, a1, a0
; CHECK-ZVBB-NEXT: addi a2, a2, -1
; CHECK-ZVBB-NEXT: and a2, a2, a3
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2456,7 +2456,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: fsrmi a3, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a4, a4, a2
; CHECK-NEXT: li a2, 52
@@ -2482,7 +2482,7 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: csrr a1, vlenb
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index 1bbefc65d3e39..d16418f57033a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -1990,7 +1990,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: addi a2, a2, 1365
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: sltu a2, a0, a3
+; RV32-NEXT: sltu a2, a1, a0
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a2, a2, a3
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2117,10 +2117,15 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: lui a3, 209715
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
+; RV64-NEXT: sub a6, a0, a1
+; RV64-NEXT: sltu a1, a1, a0
+; RV64-NEXT: li a0, 56
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
; RV64-NEXT: addi a4, a4, -241
; RV64-NEXT: addi a5, a5, 257
+; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: and a1, a1, a6
; RV64-NEXT: slli a6, a2, 32
; RV64-NEXT: add a2, a2, a6
; RV64-NEXT: slli a6, a3, 32
@@ -2129,11 +2134,6 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: add a4, a4, a6
; RV64-NEXT: slli a6, a5, 32
; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: li a6, 56
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a1
; RV64-NEXT: vand.vx v24, v24, a2, v0.t
; RV64-NEXT: vsub.vv v8, v8, v24, v0.t
; RV64-NEXT: vand.vx v24, v8, a3, v0.t
@@ -2144,9 +2144,9 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: vadd.vv v8, v8, v24, v0.t
; RV64-NEXT: vand.vx v8, v8, a4, v0.t
; RV64-NEXT: vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t
+; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: vmv1r.v v0, v7
-; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1, v0.t
; RV64-NEXT: vand.vx v24, v24, a2, v0.t
; RV64-NEXT: vsub.vv v16, v16, v24, v0.t
@@ -2158,7 +2158,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: vadd.vv v16, v16, v24, v0.t
; RV64-NEXT: vand.vx v16, v16, a4, v0.t
; RV64-NEXT: vmul.vx v16, v16, a5, v0.t
-; RV64-NEXT: vsrl.vx v16, v16, a6, v0.t
+; RV64-NEXT: vsrl.vx v16, v16, a0, v0.t
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64:
@@ -2169,7 +2169,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; CHECK-ZVBB-NEXT: srli a2, a1, 3
; CHECK-ZVBB-NEXT: sub a3, a0, a1
; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT: sltu a2, a0, a3
+; CHECK-ZVBB-NEXT: sltu a2, a1, a0
; CHECK-ZVBB-NEXT: addi a2, a2, -1
; CHECK-ZVBB-NEXT: and a2, a2, a3
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2200,10 +2200,10 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: sub a4, a0, a1
; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v0, a2
-; RV32-NEXT: sltu a2, a0, a4
+; RV32-NEXT: sltu a2, a1, a0
+; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a2, a2, a4
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2308,10 +2308,15 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
; RV64-NEXT: lui a4, 209715
; RV64-NEXT: lui a5, 61681
; RV64-NEXT: lui a6, 4112
+; RV64-NEXT: sub a7, a0, a2
+; RV64-NEXT: sltu a0, a2, a0
+; RV64-NEXT: li a2, 56
; RV64-NEXT: addi a3, a3, 1365
; RV64-NEXT: addi a4, a4, 819
; RV64-NEXT: addi a5, a5, -241
; RV64-NEXT: addi a6, a6, 257
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: and a0, a0, a7
; RV64-NEXT: slli a7, a3, 32
; RV64-NEXT: add a3, a3, a7
; RV64-NEXT: slli a7, a4, 32
@@ -2320,11 +2325,6 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
; RV64-NEXT: add a5, a5, a7
; RV64-NEXT: slli a7, a6, 32
; RV64-NEXT: add a6, a6, a7
-; RV64-NEXT: li a7, 56
-; RV64-NEXT: sub a2, a0, a2
-; RV64-NEXT: sltu a0, a0, a2
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a2
; RV64-NEXT: vand.vx v24, v24, a3
; RV64-NEXT: vsub.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -2346,26 +2346,26 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
; RV64-NEXT: vadd.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vand.vx v8, v8, a5
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vv v16, v16, v24
+; RV64-NEXT: vand.vx v16, v16, a5
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vmul.vx v8, v8, a6
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v16, v16, a5
+; RV64-NEXT: vmul.vx v16, v16, a6
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v8, v8, a7
+; RV64-NEXT: vsrl.vx v8, v8, a2
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vmul.vx v16, v16, a6
-; RV64-NEXT: vsrl.vx v16, v16, a7
+; RV64-NEXT: vsrl.vx v16, v16, a2
; RV64-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64_unmasked:
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: csrr a1, vlenb
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index c82ad17545a6a..464c4d1f5f899 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -2154,7 +2154,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: srli a2, a1, 3
; RV32-NEXT: sub a3, a0, a1
; RV32-NEXT: vslidedown.vx v0, v0, a2
-; RV32-NEXT: sltu a2, a0, a3
+; RV32-NEXT: sltu a2, a1, a0
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a2, a2, a3
; RV32-NEXT: lui a3, 349525
@@ -2190,31 +2190,31 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: vand.vv v8, v8, v24, v0.t
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
; RV32-NEXT: lui a3, 61681
; RV32-NEXT: addi a3, a3, -241
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a3
+; RV32-NEXT: vmv.v.x v8, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
; RV32-NEXT: lui a3, 4112
; RV32-NEXT: addi a3, a3, 257
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a3
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v8, v0.t
+; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a2, 56
-; RV32-NEXT: vsrl.vx v8, v16, a2, v0.t
+; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
; RV32-NEXT: addi a3, sp, 16
; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
; RV32-NEXT: bltu a0, a1, .LBB46_2
@@ -2226,11 +2226,11 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vi v16, v8, -1, v0.t
-; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vadd.vi v8, v16, -1, v0.t
+; RV32-NEXT: vnot.v v16, v16, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
@@ -2286,11 +2286,14 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: lui a5, 4112
; RV64-NEXT: srli a6, a1, 3
; RV64-NEXT: sub a7, a0, a1
+; RV64-NEXT: vslidedown.vx v0, v0, a6
+; RV64-NEXT: sltu a6, a1, a0
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
; RV64-NEXT: addi a4, a4, -241
; RV64-NEXT: addi t0, a5, 257
-; RV64-NEXT: vslidedown.vx v0, v0, a6
+; RV64-NEXT: addi a6, a6, -1
+; RV64-NEXT: and a7, a6, a7
; RV64-NEXT: slli a6, a2, 32
; RV64-NEXT: add a6, a2, a6
; RV64-NEXT: slli a5, a3, 32
@@ -2299,9 +2302,6 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; RV64-NEXT: add a2, a4, a2
; RV64-NEXT: slli a3, t0, 32
; RV64-NEXT: add a3, t0, a3
-; RV64-NEXT: sltu a4, a0, a7
-; RV64-NEXT: addi a4, a4, -1
-; RV64-NEXT: and a7, a4, a7
; RV64-NEXT: li a4, 56
; RV64-NEXT: vsetvli zero, a7, e64, m8, ta, ma
; RV64-NEXT: vadd.vi v24, v16, -1, v0.t
@@ -2350,7 +2350,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
; CHECK-ZVBB-NEXT: srli a2, a1, 3
; CHECK-ZVBB-NEXT: sub a3, a0, a1
; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT: sltu a2, a0, a3
+; CHECK-ZVBB-NEXT: sltu a2, a1, a0
; CHECK-ZVBB-NEXT: addi a2, a2, -1
; CHECK-ZVBB-NEXT: and a2, a2, a3
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2381,10 +2381,10 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: sub a4, a0, a1
; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v0, a2
-; RV32-NEXT: sltu a2, a0, a4
+; RV32-NEXT: sltu a2, a1, a0
+; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a2, a2, a4
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2489,21 +2489,21 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
; RV64-NEXT: sub a6, a0, a1
+; RV64-NEXT: sltu a7, a1, a0
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
-; RV64-NEXT: addi a7, a4, -241
-; RV64-NEXT: addi t0, a5, 257
+; RV64-NEXT: addi t0, a4, -241
+; RV64-NEXT: addi t1, a5, 257
+; RV64-NEXT: addi a7, a7, -1
+; RV64-NEXT: and a6, a7, a6
; RV64-NEXT: slli a5, a2, 32
; RV64-NEXT: add a5, a2, a5
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a4, a3, a4
-; RV64-NEXT: slli a2, a7, 32
-; RV64-NEXT: add a2, a7, a2
-; RV64-NEXT: slli a3, t0, 32
-; RV64-NEXT: add a3, t0, a3
-; RV64-NEXT: sltu a7, a0, a6
-; RV64-NEXT: addi a7, a7, -1
-; RV64-NEXT: and a6, a7, a6
+; RV64-NEXT: slli a2, t0, 32
+; RV64-NEXT: add a2, t0, a2
+; RV64-NEXT: slli a3, t1, 32
+; RV64-NEXT: add a3, t1, a3
; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
; RV64-NEXT: vadd.vi v24, v16, -1
; RV64-NEXT: vnot.v v16, v16
@@ -2547,7 +2547,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: csrr a1, vlenb
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -3731,7 +3731,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a4, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a4
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a4, a2, a4
; CHECK-NEXT: li a2, 52
@@ -3766,7 +3766,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
; CHECK-ZVBB-NEXT: srli a2, a1, 3
; CHECK-ZVBB-NEXT: sub a3, a0, a1
; CHECK-ZVBB-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT: sltu a2, a0, a3
+; CHECK-ZVBB-NEXT: sltu a2, a1, a0
; CHECK-ZVBB-NEXT: addi a2, a2, -1
; CHECK-ZVBB-NEXT: and a2, a2, a3
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -3789,7 +3789,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: fsrmi a3, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a4, a0, a2
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a4, a4, a2
; CHECK-NEXT: li a2, 52
@@ -3819,7 +3819,7 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: csrr a1, vlenb
; CHECK-ZVBB-NEXT: sub a2, a0, a1
-; CHECK-ZVBB-NEXT: sltu a3, a0, a2
+; CHECK-ZVBB-NEXT: sltu a3, a1, a0
; CHECK-ZVBB-NEXT: addi a3, a3, -1
; CHECK-ZVBB-NEXT: and a2, a3, a2
; CHECK-ZVBB-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
index fa81e1f6f3514..912a63b09f1a7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
@@ -392,10 +392,10 @@ define <32 x i64> @vp_abs_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl)
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vrsub.vi v24, v8, 0, v0.t
; CHECK-NEXT: vmax.vv v8, v8, v24, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vrsub.vi v24, v16, 0, v0.t
@@ -417,10 +417,10 @@ define <32 x i64> @vp_abs_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vrsub.vi v24, v8, 0
; CHECK-NEXT: vmax.vv v8, v8, v24
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vrsub.vi v24, v16, 0
; CHECK-NEXT: vmax.vv v16, v16, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index f436bbb9a66ca..8e322b64ef551 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -2386,10 +2386,10 @@ define <128 x i16> @vp_bitreverse_v128i16(<128 x i16> %va, <128 x i1> %m, i32 ze
; CHECK-NEXT: vsrl.vi v24, v8, 8, v0.t
; CHECK-NEXT: lui a1, 1
; CHECK-NEXT: lui a2, 3
-; CHECK-NEXT: addi a3, a0, -64
-; CHECK-NEXT: sltu a0, a0, a3
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a3, a0, a3
+; CHECK-NEXT: sltiu a3, a0, 65
+; CHECK-NEXT: addi a0, a0, -64
+; CHECK-NEXT: neg a3, a3
+; CHECK-NEXT: and a3, a3, a0
; CHECK-NEXT: lui a0, 5
; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t
; CHECK-NEXT: addi a1, a1, -241
@@ -2450,10 +2450,10 @@ define <128 x i16> @vp_bitreverse_v128i16_unmasked(<128 x i16> %va, i32 zeroext
; CHECK-NEXT: vsll.vi v8, v8, 8
; CHECK-NEXT: lui a2, 1
; CHECK-NEXT: lui a3, 3
-; CHECK-NEXT: addi a4, a0, -64
-; CHECK-NEXT: sltu a0, a0, a4
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a4
+; CHECK-NEXT: sltiu a4, a0, 65
+; CHECK-NEXT: addi a0, a0, -64
+; CHECK-NEXT: neg a4, a4
+; CHECK-NEXT: and a0, a4, a0
; CHECK-NEXT: lui a4, 5
; CHECK-NEXT: vor.vv v8, v8, v24
; CHECK-NEXT: addi a2, a2, -241
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index eca94ccb9bf7f..c1c9e581decf8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -1275,10 +1275,10 @@ define <128 x i16> @vp_bswap_v128i16(<128 x i16> %va, <128 x i1> %m, i32 zeroext
; CHECK-NEXT: vsrl.vi v24, v8, 8, v0.t
; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t
; CHECK-NEXT: vor.vv v8, v8, v24, v0.t
-; CHECK-NEXT: addi a1, a0, -64
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 65
+; CHECK-NEXT: addi a0, a0, -64
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vsrl.vi v24, v16, 8, v0.t
@@ -1302,10 +1302,10 @@ define <128 x i16> @vp_bswap_v128i16_unmasked(<128 x i16> %va, i32 zeroext %evl)
; CHECK-NEXT: vsrl.vi v24, v8, 8
; CHECK-NEXT: vsll.vi v8, v8, 8
; CHECK-NEXT: vor.vv v8, v8, v24
-; CHECK-NEXT: addi a1, a0, -64
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 65
+; CHECK-NEXT: addi a0, a0, -64
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vsrl.vi v24, v16, 8
; CHECK-NEXT: vsll.vi v16, v16, 8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
index 466d5d4b8e80a..b58de7abf0442 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT: addi a1, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a1
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a1
+; RV32ZVFH-NEXT: sltiu a1, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a1, a1
+; RV32ZVFH-NEXT: and a0, a1, a0
; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFH-NEXT: fsrmi a1, 3
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
; RV64ZVFH-NEXT: li a1, 1075
; RV64ZVFH-NEXT: slli a1, a1, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a1
-; RV64ZVFH-NEXT: addi a1, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a1
-; RV64ZVFH-NEXT: addi a0, a0, -1
+; RV64ZVFH-NEXT: sltiu a1, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a1, a1
+; RV64ZVFH-NEXT: and a0, a1, a0
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT: and a0, a0, a1
; RV64ZVFH-NEXT: fsrmi a1, 3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT: addi a1, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a1
+; RV32ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a1, a1
+; RV32ZVFHMIN-NEXT: and a0, a1, a0
; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFHMIN-NEXT: fsrmi a1, 3
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
; RV64ZVFHMIN-NEXT: li a1, 1075
; RV64ZVFHMIN-NEXT: slli a1, a1, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT: addi a1, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
+; RV64ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a1, a1
+; RV64ZVFHMIN-NEXT: and a0, a1, a0
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT: and a0, a0, a1
; RV64ZVFHMIN-NEXT: fsrmi a1, 3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; RV32ZVFH-NEXT: vfabs.v v24, v8
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT: addi a2, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a2
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a2
+; RV32ZVFH-NEXT: sltiu a2, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a2, a2
+; RV32ZVFH-NEXT: and a0, a2, a0
; RV32ZVFH-NEXT: fsrmi a2, 3
; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; RV64ZVFH-NEXT: li a2, 1075
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: addi a2, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a2
-; RV64ZVFH-NEXT: addi a0, a0, -1
-; RV64ZVFH-NEXT: and a0, a0, a2
+; RV64ZVFH-NEXT: sltiu a2, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a2, a2
+; RV64ZVFH-NEXT: and a0, a2, a0
; RV64ZVFH-NEXT: fsrmi a2, 3
; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT: addi a2, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a2
+; RV32ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a2, a2
+; RV32ZVFHMIN-NEXT: and a0, a2, a0
; RV32ZVFHMIN-NEXT: fsrmi a2, 3
; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; RV64ZVFHMIN-NEXT: li a2, 1075
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: addi a2, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
-; RV64ZVFHMIN-NEXT: and a0, a0, a2
+; RV64ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a2, a2
+; RV64ZVFHMIN-NEXT: and a0, a2, a0
; RV64ZVFHMIN-NEXT: fsrmi a2, 3
; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index 00c36cb7f7327..d1fadc962c2eb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -1979,10 +1979,10 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: sltiu a3, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a0, a3, a0
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
@@ -2065,22 +2065,22 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV64-NEXT: lui a3, 209715
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
+; RV64-NEXT: sltiu a6, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
-; RV64-NEXT: addi a6, a4, -241
-; RV64-NEXT: addi a7, a5, 257
+; RV64-NEXT: addi a7, a4, -241
+; RV64-NEXT: addi t0, a5, 257
+; RV64-NEXT: neg a4, a6
+; RV64-NEXT: and a6, a4, a0
; RV64-NEXT: slli a5, a2, 32
; RV64-NEXT: add a5, a2, a5
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a4, a3, a4
-; RV64-NEXT: slli a2, a6, 32
-; RV64-NEXT: add a2, a6, a2
-; RV64-NEXT: slli a3, a7, 32
-; RV64-NEXT: add a3, a7, a3
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a6, a0, a6
+; RV64-NEXT: slli a2, a7, 32
+; RV64-NEXT: add a2, a7, a2
+; RV64-NEXT: slli a3, t0, 32
+; RV64-NEXT: add a3, t0, a3
; RV64-NEXT: li a0, 56
; RV64-NEXT: vor.vv v8, v8, v24, v0.t
; RV64-NEXT: vsrl.vi v24, v8, 2, v0.t
@@ -2150,9 +2150,9 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: li a2, 32
@@ -2160,110 +2160,102 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: addi a3, a3, 1365
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a3
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: sltiu a3, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a0, a3, a0
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vsrl.vi v0, v8, 2
; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 1
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 8
+; RV32-NEXT: vsrl.vi v0, v8, 4
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 2
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 16
+; RV32-NEXT: vsrl.vi v0, v8, 8
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 4
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v0, v8, a2
+; RV32-NEXT: vsrl.vi v0, v8, 16
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 8
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vsrl.vx v0, v8, a2
+; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 16
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vsrl.vx v0, v16, a2
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: vand.vv v0, v0, v24
-; RV32-NEXT: vsub.vv v0, v8, v0
+; RV32-NEXT: vsub.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a2
-; RV32-NEXT: vor.vv v24, v16, v8
+; RV32-NEXT: vnot.v v0, v16
+; RV32-NEXT: vsrl.vi v16, v0, 1
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a3
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v0, v8
-; RV32-NEXT: vsrl.vi v0, v0, 2
-; RV32-NEXT: vand.vv v0, v0, v8
-; RV32-NEXT: vadd.vv v16, v16, v0
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v24, v24
-; RV32-NEXT: vsrl.vi v0, v24, 1
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v0, v0, v16
-; RV32-NEXT: vsub.vv v24, v24, v0
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsub.vv v24, v0, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v0
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v24, v8
+; RV32-NEXT: vand.vv v0, v24, v16
; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v8, v24, v8
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: lui a3, 4112
; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: addi a3, a3, 257
-; RV32-NEXT: vadd.vv v8, v0, v8
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v16, v0, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v24
+; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v16, v16, v24
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a2
+; RV32-NEXT: vsrl.vx v8, v8, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v16, v24, a2
+; RV32-NEXT: vsrl.vx v16, v16, a2
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 16
; RV32-NEXT: addi sp, sp, 16
@@ -2285,95 +2277,95 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: lui a4, 209715
; RV64-NEXT: lui a5, 61681
; RV64-NEXT: lui a6, 4112
-; RV64-NEXT: addi a7, a3, 1365
-; RV64-NEXT: addi a3, a4, 819
-; RV64-NEXT: addi a4, a5, -241
-; RV64-NEXT: addi a6, a6, 257
-; RV64-NEXT: slli a5, a7, 32
-; RV64-NEXT: add a7, a7, a5
-; RV64-NEXT: slli a5, a3, 32
-; RV64-NEXT: add a5, a3, a5
-; RV64-NEXT: slli a3, a4, 32
-; RV64-NEXT: add a3, a4, a3
-; RV64-NEXT: slli a4, a6, 32
-; RV64-NEXT: add a4, a6, a4
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a6, a0, a6
-; RV64-NEXT: li a0, 56
+; RV64-NEXT: sltiu a7, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: addi a3, a3, 1365
+; RV64-NEXT: addi a4, a4, 819
+; RV64-NEXT: addi a5, a5, -241
+; RV64-NEXT: addi t0, a6, 257
+; RV64-NEXT: neg a6, a7
+; RV64-NEXT: and a0, a6, a0
+; RV64-NEXT: slli a6, a3, 32
+; RV64-NEXT: add a7, a3, a6
+; RV64-NEXT: slli a6, a4, 32
+; RV64-NEXT: add a6, a4, a6
+; RV64-NEXT: slli a3, a5, 32
+; RV64-NEXT: add a3, a5, a3
+; RV64-NEXT: slli a4, t0, 32
+; RV64-NEXT: add a4, t0, a4
+; RV64-NEXT: li a5, 56
; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vsrl.vi v24, v8, 2
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsrl.vi v24, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v8, 8
+; RV64-NEXT: vsrl.vi v24, v8, 4
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 2
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v8, 16
+; RV64-NEXT: vsrl.vi v24, v8, 8
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 4
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v24, v8, a2
+; RV64-NEXT: vsrl.vi v24, v8, 16
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 8
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsrl.vx v24, v8, a2
+; RV64-NEXT: vor.vv v8, v8, v24
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 16
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vnot.v v8, v8
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vor.vv v16, v16, v24
+; RV64-NEXT: vsrl.vx v24, v16, a2
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v8, 1
; RV64-NEXT: vand.vx v24, v24, a7
; RV64-NEXT: vsub.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v24, v16, a2
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v24, v8, a5
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vnot.v v16, v16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v8, v8, a5
+; RV64-NEXT: vand.vx v24, v8, a6
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vx v8, v8, a6
; RV64-NEXT: vadd.vv v8, v24, v8
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1
; RV64-NEXT: vand.vx v24, v24, a7
; RV64-NEXT: vsub.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v24, v16, a5
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vand.vx v24, v16, a6
; RV64-NEXT: vsrl.vi v16, v16, 2
+; RV64-NEXT: vand.vx v16, v16, a6
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v16, v16, a5
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vmul.vx v8, v8, a4
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vadd.vv v16, v24, v16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vmul.vx v8, v8, a4
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsrl.vx v8, v8, a5
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vand.vx v16, v16, a3
; RV64-NEXT: vmul.vx v16, v16, a4
-; RV64-NEXT: vsrl.vx v16, v16, a0
+; RV64-NEXT: vsrl.vx v16, v16, a5
; RV64-NEXT: ret
%v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 false, <32 x i1> splat (i1 true), i32 %evl)
ret <32 x i64> %v
@@ -4354,10 +4346,10 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: sltiu a3, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a0, a3, a0
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: li a4, 40
@@ -4440,22 +4432,22 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV64-NEXT: lui a3, 209715
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
+; RV64-NEXT: sltiu a6, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
-; RV64-NEXT: addi a6, a4, -241
-; RV64-NEXT: addi a7, a5, 257
+; RV64-NEXT: addi a7, a4, -241
+; RV64-NEXT: addi t0, a5, 257
+; RV64-NEXT: neg a4, a6
+; RV64-NEXT: and a6, a4, a0
; RV64-NEXT: slli a5, a2, 32
; RV64-NEXT: add a5, a2, a5
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a4, a3, a4
-; RV64-NEXT: slli a2, a6, 32
-; RV64-NEXT: add a2, a6, a2
-; RV64-NEXT: slli a3, a7, 32
-; RV64-NEXT: add a3, a7, a3
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a6, a0, a6
+; RV64-NEXT: slli a2, a7, 32
+; RV64-NEXT: add a2, a7, a2
+; RV64-NEXT: slli a3, t0, 32
+; RV64-NEXT: add a3, t0, a3
; RV64-NEXT: li a0, 56
; RV64-NEXT: vor.vv v8, v8, v24, v0.t
; RV64-NEXT: vsrl.vi v24, v8, 2, v0.t
@@ -4525,9 +4517,9 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: li a2, 32
@@ -4535,110 +4527,102 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: addi a3, a3, 1365
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a3
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: sltiu a3, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a0, a3, a0
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vsrl.vi v0, v8, 2
; RV32-NEXT: vor.vv v8, v8, v0
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 1
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 8
+; RV32-NEXT: vsrl.vi v0, v8, 4
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 2
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 16
+; RV32-NEXT: vsrl.vi v0, v8, 8
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 4
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v0, v8, a2
+; RV32-NEXT: vsrl.vi v0, v8, 16
; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 8
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vsrl.vx v0, v8, a2
+; RV32-NEXT: vor.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v16, 16
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vnot.v v8, v8
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vor.vv v16, v16, v0
+; RV32-NEXT: vsrl.vx v0, v16, a2
; RV32-NEXT: vor.vv v16, v16, v0
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: vand.vv v0, v0, v24
-; RV32-NEXT: vsub.vv v0, v8, v0
+; RV32-NEXT: vsub.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a2
-; RV32-NEXT: vor.vv v24, v16, v8
+; RV32-NEXT: vnot.v v0, v16
+; RV32-NEXT: vsrl.vi v16, v0, 1
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a3
+; RV32-NEXT: vmv.v.x v16, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v0, v8
-; RV32-NEXT: vsrl.vi v0, v0, 2
-; RV32-NEXT: vand.vv v0, v0, v8
-; RV32-NEXT: vadd.vv v16, v16, v0
+; RV32-NEXT: vand.vv v24, v8, v16
+; RV32-NEXT: vsrl.vi v8, v8, 2
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vnot.v v24, v24
-; RV32-NEXT: vsrl.vi v0, v24, 1
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v0, v0, v16
-; RV32-NEXT: vsub.vv v24, v24, v0
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsub.vv v24, v0, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v16, 4
-; RV32-NEXT: vadd.vv v16, v16, v0
+; RV32-NEXT: vsrl.vi v0, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v0, v24, v8
+; RV32-NEXT: vand.vv v0, v24, v16
; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v8, v24, v8
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: lui a3, 4112
; RV32-NEXT: addi a2, a2, -241
; RV32-NEXT: addi a3, a3, 257
-; RV32-NEXT: vadd.vv v8, v0, v8
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vadd.vv v16, v0, v16
+; RV32-NEXT: vsrl.vi v24, v16, 4
+; RV32-NEXT: vadd.vv v16, v16, v24
; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a3
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v16, v16, v24
+; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v16, v16, v24
; RV32-NEXT: li a2, 56
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v8, v16, a2
+; RV32-NEXT: vsrl.vx v8, v8, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vx v16, v24, a2
+; RV32-NEXT: vsrl.vx v16, v16, a2
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 16
; RV32-NEXT: addi sp, sp, 16
@@ -4660,95 +4644,95 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV64-NEXT: lui a4, 209715
; RV64-NEXT: lui a5, 61681
; RV64-NEXT: lui a6, 4112
-; RV64-NEXT: addi a7, a3, 1365
-; RV64-NEXT: addi a3, a4, 819
-; RV64-NEXT: addi a4, a5, -241
-; RV64-NEXT: addi a6, a6, 257
-; RV64-NEXT: slli a5, a7, 32
-; RV64-NEXT: add a7, a7, a5
-; RV64-NEXT: slli a5, a3, 32
-; RV64-NEXT: add a5, a3, a5
-; RV64-NEXT: slli a3, a4, 32
-; RV64-NEXT: add a3, a4, a3
-; RV64-NEXT: slli a4, a6, 32
-; RV64-NEXT: add a4, a6, a4
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a6, a0, a6
-; RV64-NEXT: li a0, 56
+; RV64-NEXT: sltiu a7, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: addi a3, a3, 1365
+; RV64-NEXT: addi a4, a4, 819
+; RV64-NEXT: addi a5, a5, -241
+; RV64-NEXT: addi t0, a6, 257
+; RV64-NEXT: neg a6, a7
+; RV64-NEXT: and a0, a6, a0
+; RV64-NEXT: slli a6, a3, 32
+; RV64-NEXT: add a7, a3, a6
+; RV64-NEXT: slli a6, a4, 32
+; RV64-NEXT: add a6, a4, a6
+; RV64-NEXT: slli a3, a5, 32
+; RV64-NEXT: add a3, a5, a3
+; RV64-NEXT: slli a4, t0, 32
+; RV64-NEXT: add a4, t0, a4
+; RV64-NEXT: li a5, 56
; RV64-NEXT: vor.vv v8, v8, v24
; RV64-NEXT: vsrl.vi v24, v8, 2
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsrl.vi v24, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v8, 8
+; RV64-NEXT: vsrl.vi v24, v8, 4
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 2
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v8, 16
+; RV64-NEXT: vsrl.vi v24, v8, 8
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 4
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v24, v8, a2
+; RV64-NEXT: vsrl.vi v24, v8, 16
; RV64-NEXT: vor.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 8
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsrl.vx v24, v8, a2
+; RV64-NEXT: vor.vv v8, v8, v24
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 16
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vnot.v v8, v8
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vor.vv v16, v16, v24
+; RV64-NEXT: vsrl.vx v24, v16, a2
; RV64-NEXT: vor.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v8, 1
; RV64-NEXT: vand.vx v24, v24, a7
; RV64-NEXT: vsub.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v24, v16, a2
-; RV64-NEXT: vor.vv v16, v16, v24
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v24, v8, a5
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vnot.v v16, v16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v8, v8, a5
+; RV64-NEXT: vand.vx v24, v8, a6
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vx v8, v8, a6
; RV64-NEXT: vadd.vv v8, v24, v8
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1
; RV64-NEXT: vand.vx v24, v24, a7
; RV64-NEXT: vsub.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v24
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v24, v16, a5
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vand.vx v24, v16, a6
; RV64-NEXT: vsrl.vi v16, v16, 2
+; RV64-NEXT: vand.vx v16, v16, a6
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v16, v16, a5
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vmul.vx v8, v8, a4
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vadd.vv v16, v24, v16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT: vmul.vx v8, v8, a4
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsrl.vx v8, v8, a5
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vand.vx v16, v16, a3
; RV64-NEXT: vmul.vx v16, v16, a4
-; RV64-NEXT: vsrl.vx v16, v16, a0
+; RV64-NEXT: vsrl.vx v16, v16, a5
; RV64-NEXT: ret
%v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 true, <32 x i1> splat (i1 true), i32 %evl)
ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index f56438bf87e6a..61bc86333d95f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -1430,7 +1430,10 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v8, v24, v0.t
@@ -1455,24 +1458,21 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: addi a2, a2, 257
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, sp, 16
; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: li a3, 40
@@ -1481,15 +1481,18 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v8, v24, v8, v0.t
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; RV32-NEXT: vand.vv v16, v8, v24, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
@@ -1504,15 +1507,12 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -1541,10 +1541,14 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV64-NEXT: lui a2, 209715
; RV64-NEXT: lui a3, 61681
; RV64-NEXT: lui a4, 4112
+; RV64-NEXT: sltiu a5, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a1, a1, 1365
; RV64-NEXT: addi a2, a2, 819
; RV64-NEXT: addi a3, a3, -241
; RV64-NEXT: addi a4, a4, 257
+; RV64-NEXT: neg a5, a5
+; RV64-NEXT: and a0, a5, a0
; RV64-NEXT: slli a5, a1, 32
; RV64-NEXT: add a1, a1, a5
; RV64-NEXT: slli a5, a2, 32
@@ -1553,10 +1557,6 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; RV64-NEXT: add a3, a3, a5
; RV64-NEXT: slli a5, a4, 32
; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: addi a5, a0, -16
-; RV64-NEXT: sltu a0, a0, a5
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a5
; RV64-NEXT: li a5, 56
; RV64-NEXT: vand.vx v24, v24, a1, v0.t
; RV64-NEXT: vsub.vv v8, v8, v24, v0.t
@@ -1603,10 +1603,10 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: addi a2, a2, 1365
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v0, a2
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: addi a2, a2, 819
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -1628,13 +1628,13 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vand.vv v24, v16, v0
; RV32-NEXT: vsrl.vi v16, v16, 2
; RV32-NEXT: vand.vv v16, v16, v0
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v0
; RV32-NEXT: lui a2, 61681
; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vadd.vv v16, v24, v16
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v24, v16, 4
; RV32-NEXT: vadd.vv v16, v16, v24
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
@@ -1672,10 +1672,14 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: lui a3, 209715
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
+; RV64-NEXT: sltiu a6, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
; RV64-NEXT: addi a4, a4, -241
; RV64-NEXT: addi a5, a5, 257
+; RV64-NEXT: neg a6, a6
+; RV64-NEXT: and a0, a6, a0
; RV64-NEXT: slli a6, a2, 32
; RV64-NEXT: add a2, a2, a6
; RV64-NEXT: slli a6, a3, 32
@@ -1684,10 +1688,6 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: add a4, a4, a6
; RV64-NEXT: slli a6, a5, 32
; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a6
; RV64-NEXT: li a6, 56
; RV64-NEXT: vand.vx v24, v24, a2
; RV64-NEXT: vsub.vv v8, v8, v24
@@ -1710,18 +1710,18 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: vadd.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vand.vx v8, v8, a4
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vv v16, v16, v24
+; RV64-NEXT: vand.vx v16, v16, a4
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vmul.vx v8, v8, a5
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v16, v16, a4
+; RV64-NEXT: vmul.vx v16, v16, a5
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vx v8, v8, a6
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vmul.vx v16, v16, a5
; RV64-NEXT: vsrl.vx v16, v16, a6
; RV64-NEXT: ret
%v = call <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64> %va, <32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index 098384d200045..0e3eadcce484e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1604,10 +1604,10 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: li a3, 40
@@ -1616,26 +1616,26 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vi v24, v8, -1, v0.t
+; RV32-NEXT: vadd.vi v16, v8, -1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v24, v0.t
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
@@ -1679,31 +1679,31 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV64-NEXT: lui a2, 209715
; RV64-NEXT: lui a3, 61681
; RV64-NEXT: lui a4, 4112
+; RV64-NEXT: sltiu a5, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a1, a1, 1365
; RV64-NEXT: addi a2, a2, 819
-; RV64-NEXT: addi a5, a3, -241
+; RV64-NEXT: addi a3, a3, -241
; RV64-NEXT: addi a4, a4, 257
-; RV64-NEXT: slli a3, a1, 32
-; RV64-NEXT: add a6, a1, a3
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a3, a2, a3
-; RV64-NEXT: slli a1, a5, 32
-; RV64-NEXT: add a1, a5, a1
+; RV64-NEXT: neg a5, a5
+; RV64-NEXT: and a5, a5, a0
+; RV64-NEXT: slli a0, a1, 32
+; RV64-NEXT: add a6, a1, a0
+; RV64-NEXT: slli a0, a2, 32
+; RV64-NEXT: add a7, a2, a0
+; RV64-NEXT: slli a1, a3, 32
+; RV64-NEXT: add a1, a3, a1
; RV64-NEXT: slli a2, a4, 32
; RV64-NEXT: add a2, a4, a2
-; RV64-NEXT: addi a4, a0, -16
-; RV64-NEXT: sltu a0, a0, a4
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a4, a0, a4
; RV64-NEXT: li a0, 56
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vand.vv v8, v8, v24, v0.t
; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t
; RV64-NEXT: vand.vx v24, v24, a6, v0.t
; RV64-NEXT: vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT: vand.vx v24, v8, a3, v0.t
+; RV64-NEXT: vand.vx v24, v8, a7, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
+; RV64-NEXT: vand.vx v8, v8, a7, v0.t
; RV64-NEXT: vadd.vv v8, v24, v8, v0.t
; RV64-NEXT: vsrl.vi v24, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v24, v0.t
@@ -1711,16 +1711,16 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; RV64-NEXT: vmul.vx v8, v8, a2, v0.t
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: vmv1r.v v0, v7
-; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a5, e64, m8, ta, ma
; RV64-NEXT: vadd.vi v24, v16, -1, v0.t
; RV64-NEXT: vnot.v v16, v16, v0.t
; RV64-NEXT: vand.vv v16, v16, v24, v0.t
; RV64-NEXT: vsrl.vi v24, v16, 1, v0.t
; RV64-NEXT: vand.vx v24, v24, a6, v0.t
; RV64-NEXT: vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT: vand.vx v24, v16, a3, v0.t
+; RV64-NEXT: vand.vx v24, v16, a7, v0.t
; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT: vand.vx v16, v16, a3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a7, v0.t
; RV64-NEXT: vadd.vv v16, v24, v16, v0.t
; RV64-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV64-NEXT: vadd.vv v16, v16, v24, v0.t
@@ -1744,9 +1744,9 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vadd.vi v24, v8, -1
; RV32-NEXT: vnot.v v0, v8
@@ -1754,15 +1754,10 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: addi a2, a2, 1365
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v0, v24
@@ -1774,8 +1769,10 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vadd.vi v0, v16, -1
; RV32-NEXT: vnot.v v16, v16
; RV32-NEXT: vand.vv v0, v16, v0
+; RV32-NEXT: vsrl.vi v16, v0, 1
+; RV32-NEXT: vand.vv v8, v16, v8
; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs8r.v v0, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -1783,16 +1780,9 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsrl.vi v24, v24, 2
; RV32-NEXT: vand.vv v24, v24, v16
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v0, 1
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v0
; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v24, v0, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 4
@@ -1826,7 +1816,7 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v16, v16, a2
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 16
; RV32-NEXT: addi sp, sp, 16
@@ -1848,10 +1838,14 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: lui a3, 209715
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
+; RV64-NEXT: sltiu a6, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
; RV64-NEXT: addi a4, a4, -241
; RV64-NEXT: addi a5, a5, 257
+; RV64-NEXT: neg a6, a6
+; RV64-NEXT: and a0, a6, a0
; RV64-NEXT: slli a6, a2, 32
; RV64-NEXT: add a2, a2, a6
; RV64-NEXT: slli a6, a3, 32
@@ -1860,47 +1854,43 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV64-NEXT: add a4, a4, a6
; RV64-NEXT: slli a6, a5, 32
; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a6
; RV64-NEXT: li a6, 56
; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: vsrl.vi v24, v8, 1
-; RV64-NEXT: vand.vx v24, v24, a2
-; RV64-NEXT: vsub.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vadd.vi v24, v16, -1
; RV64-NEXT: vnot.v v16, v16
; RV64-NEXT: vand.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v24, v8, a3
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: vadd.vv v8, v24, v8
+; RV64-NEXT: vsrl.vi v24, v8, 1
+; RV64-NEXT: vand.vx v24, v24, a2
+; RV64-NEXT: vsub.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1
; RV64-NEXT: vand.vx v24, v24, a2
; RV64-NEXT: vsub.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vand.vx v24, v8, a3
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vx v8, v8, a3
+; RV64-NEXT: vadd.vv v8, v24, v8
; RV64-NEXT: vsrl.vi v24, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vand.vx v24, v16, a3
; RV64-NEXT: vsrl.vi v16, v16, 2
; RV64-NEXT: vand.vx v16, v16, a3
+; RV64-NEXT: vadd.vv v16, v24, v16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vand.vx v8, v8, a4
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vv v16, v24, v16
+; RV64-NEXT: vsrl.vi v24, v16, 4
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vmul.vx v8, v8, a5
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vx v8, v8, a6
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vand.vx v16, v16, a4
; RV64-NEXT: vmul.vx v16, v16, a5
; RV64-NEXT: vsrl.vx v16, v16, a6
@@ -3509,10 +3499,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: li a3, 40
@@ -3521,26 +3511,26 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vi v24, v8, -1, v0.t
+; RV32-NEXT: vadd.vi v16, v8, -1, v0.t
; RV32-NEXT: vnot.v v8, v8, v0.t
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
-; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: vsub.vv v8, v8, v24, v0.t
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v24, v8, v16, v0.t
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v16, v8, v24, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
@@ -3584,31 +3574,31 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV64-NEXT: lui a2, 209715
; RV64-NEXT: lui a3, 61681
; RV64-NEXT: lui a4, 4112
+; RV64-NEXT: sltiu a5, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a1, a1, 1365
; RV64-NEXT: addi a2, a2, 819
-; RV64-NEXT: addi a5, a3, -241
+; RV64-NEXT: addi a3, a3, -241
; RV64-NEXT: addi a4, a4, 257
-; RV64-NEXT: slli a3, a1, 32
-; RV64-NEXT: add a6, a1, a3
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a3, a2, a3
-; RV64-NEXT: slli a1, a5, 32
-; RV64-NEXT: add a1, a5, a1
+; RV64-NEXT: neg a5, a5
+; RV64-NEXT: and a5, a5, a0
+; RV64-NEXT: slli a0, a1, 32
+; RV64-NEXT: add a6, a1, a0
+; RV64-NEXT: slli a0, a2, 32
+; RV64-NEXT: add a7, a2, a0
+; RV64-NEXT: slli a1, a3, 32
+; RV64-NEXT: add a1, a3, a1
; RV64-NEXT: slli a2, a4, 32
; RV64-NEXT: add a2, a4, a2
-; RV64-NEXT: addi a4, a0, -16
-; RV64-NEXT: sltu a0, a0, a4
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a4, a0, a4
; RV64-NEXT: li a0, 56
; RV64-NEXT: vnot.v v8, v8, v0.t
; RV64-NEXT: vand.vv v8, v8, v24, v0.t
; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t
; RV64-NEXT: vand.vx v24, v24, a6, v0.t
; RV64-NEXT: vsub.vv v8, v8, v24, v0.t
-; RV64-NEXT: vand.vx v24, v8, a3, v0.t
+; RV64-NEXT: vand.vx v24, v8, a7, v0.t
; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT: vand.vx v8, v8, a3, v0.t
+; RV64-NEXT: vand.vx v8, v8, a7, v0.t
; RV64-NEXT: vadd.vv v8, v24, v8, v0.t
; RV64-NEXT: vsrl.vi v24, v8, 4, v0.t
; RV64-NEXT: vadd.vv v8, v8, v24, v0.t
@@ -3616,16 +3606,16 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
; RV64-NEXT: vmul.vx v8, v8, a2, v0.t
; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t
; RV64-NEXT: vmv1r.v v0, v7
-; RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a5, e64, m8, ta, ma
; RV64-NEXT: vadd.vi v24, v16, -1, v0.t
; RV64-NEXT: vnot.v v16, v16, v0.t
; RV64-NEXT: vand.vv v16, v16, v24, v0.t
; RV64-NEXT: vsrl.vi v24, v16, 1, v0.t
; RV64-NEXT: vand.vx v24, v24, a6, v0.t
; RV64-NEXT: vsub.vv v16, v16, v24, v0.t
-; RV64-NEXT: vand.vx v24, v16, a3, v0.t
+; RV64-NEXT: vand.vx v24, v16, a7, v0.t
; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT: vand.vx v16, v16, a3, v0.t
+; RV64-NEXT: vand.vx v16, v16, a7, v0.t
; RV64-NEXT: vadd.vv v16, v24, v16, v0.t
; RV64-NEXT: vsrl.vi v24, v16, 4, v0.t
; RV64-NEXT: vadd.vv v16, v16, v24, v0.t
@@ -3649,9 +3639,9 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vadd.vi v24, v8, -1
; RV32-NEXT: vnot.v v0, v8
@@ -3659,15 +3649,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: addi a2, a2, 1365
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v0, v24
@@ -3679,8 +3664,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vadd.vi v0, v16, -1
; RV32-NEXT: vnot.v v16, v16
; RV32-NEXT: vand.vv v0, v16, v0
+; RV32-NEXT: vsrl.vi v16, v0, 1
+; RV32-NEXT: vand.vv v8, v16, v8
; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs8r.v v0, (a3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a2
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -3688,16 +3675,9 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vsrl.vi v24, v24, 2
; RV32-NEXT: vand.vv v24, v24, v16
; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v0, 1
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v0
; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vv v24, v0, v24
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v0, v8, 4
@@ -3731,7 +3711,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v16, v16, a2
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: .cfi_def_cfa sp, 16
; RV32-NEXT: addi sp, sp, 16
@@ -3753,10 +3733,14 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV64-NEXT: lui a3, 209715
; RV64-NEXT: lui a4, 61681
; RV64-NEXT: lui a5, 4112
+; RV64-NEXT: sltiu a6, a0, 17
+; RV64-NEXT: addi a0, a0, -16
; RV64-NEXT: addi a2, a2, 1365
; RV64-NEXT: addi a3, a3, 819
; RV64-NEXT: addi a4, a4, -241
; RV64-NEXT: addi a5, a5, 257
+; RV64-NEXT: neg a6, a6
+; RV64-NEXT: and a0, a6, a0
; RV64-NEXT: slli a6, a2, 32
; RV64-NEXT: add a2, a2, a6
; RV64-NEXT: slli a6, a3, 32
@@ -3765,47 +3749,43 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
; RV64-NEXT: add a4, a4, a6
; RV64-NEXT: slli a6, a5, 32
; RV64-NEXT: add a5, a5, a6
-; RV64-NEXT: addi a6, a0, -16
-; RV64-NEXT: sltu a0, a0, a6
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a6
; RV64-NEXT: li a6, 56
; RV64-NEXT: vand.vv v8, v8, v24
-; RV64-NEXT: vsrl.vi v24, v8, 1
-; RV64-NEXT: vand.vx v24, v24, a2
-; RV64-NEXT: vsub.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vadd.vi v24, v16, -1
; RV64-NEXT: vnot.v v16, v16
; RV64-NEXT: vand.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vand.vx v24, v8, a3
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a3
-; RV64-NEXT: vadd.vv v8, v24, v8
+; RV64-NEXT: vsrl.vi v24, v8, 1
+; RV64-NEXT: vand.vx v24, v24, a2
+; RV64-NEXT: vsub.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsrl.vi v24, v16, 1
; RV64-NEXT: vand.vx v24, v24, a2
; RV64-NEXT: vsub.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vand.vx v24, v8, a3
+; RV64-NEXT: vsrl.vi v8, v8, 2
+; RV64-NEXT: vand.vx v8, v8, a3
+; RV64-NEXT: vadd.vv v8, v24, v8
; RV64-NEXT: vsrl.vi v24, v8, 4
; RV64-NEXT: vadd.vv v8, v8, v24
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vand.vx v24, v16, a3
; RV64-NEXT: vsrl.vi v16, v16, 2
; RV64-NEXT: vand.vx v16, v16, a3
+; RV64-NEXT: vadd.vv v16, v24, v16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vand.vx v8, v8, a4
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vv v16, v24, v16
+; RV64-NEXT: vsrl.vi v24, v16, 4
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vmul.vx v8, v8, a5
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v24, v16, 4
+; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsrl.vx v8, v8, a6
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vv v16, v16, v24
; RV64-NEXT: vand.vx v16, v16, a4
; RV64-NEXT: vmul.vx v16, v16, a5
; RV64-NEXT: vsrl.vx v16, v16, a6
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
index 76f5f0a32bd1c..5a0749068b41d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT: addi a1, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a1
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a1
+; RV32ZVFH-NEXT: sltiu a1, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a1, a1
+; RV32ZVFH-NEXT: and a0, a1, a0
; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFH-NEXT: fsrmi a1, 2
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV64ZVFH-NEXT: li a1, 1075
; RV64ZVFH-NEXT: slli a1, a1, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a1
-; RV64ZVFH-NEXT: addi a1, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a1
-; RV64ZVFH-NEXT: addi a0, a0, -1
+; RV64ZVFH-NEXT: sltiu a1, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a1, a1
+; RV64ZVFH-NEXT: and a0, a1, a0
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT: and a0, a0, a1
; RV64ZVFH-NEXT: fsrmi a1, 2
; RV64ZVFH-NEXT: vmv1r.v v0, v6
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT: addi a1, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a1
+; RV32ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a1, a1
+; RV32ZVFHMIN-NEXT: and a0, a1, a0
; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFHMIN-NEXT: fsrmi a1, 2
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV64ZVFHMIN-NEXT: li a1, 1075
; RV64ZVFHMIN-NEXT: slli a1, a1, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT: addi a1, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
+; RV64ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a1, a1
+; RV64ZVFHMIN-NEXT: and a0, a1, a0
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT: and a0, a0, a1
; RV64ZVFHMIN-NEXT: fsrmi a1, 2
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV32ZVFH-NEXT: vfabs.v v24, v8
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT: addi a2, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a2
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a2
+; RV32ZVFH-NEXT: sltiu a2, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a2, a2
+; RV32ZVFH-NEXT: and a0, a2, a0
; RV32ZVFH-NEXT: fsrmi a2, 2
; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV64ZVFH-NEXT: li a2, 1075
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: addi a2, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a2
-; RV64ZVFH-NEXT: addi a0, a0, -1
-; RV64ZVFH-NEXT: and a0, a0, a2
+; RV64ZVFH-NEXT: sltiu a2, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a2, a2
+; RV64ZVFH-NEXT: and a0, a2, a0
; RV64ZVFH-NEXT: fsrmi a2, 2
; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT: addi a2, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a2
+; RV32ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a2, a2
+; RV32ZVFHMIN-NEXT: and a0, a2, a0
; RV32ZVFHMIN-NEXT: fsrmi a2, 2
; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV64ZVFHMIN-NEXT: li a2, 1075
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: addi a2, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
-; RV64ZVFHMIN-NEXT: and a0, a0, a2
+; RV64ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a2, a2
+; RV64ZVFHMIN-NEXT: and a0, a2, a0
; RV64ZVFHMIN-NEXT: fsrmi a2, 2
; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index da6e2fae93687..ad7ee735707f7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -611,10 +611,10 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: li a2, 24
@@ -657,75 +657,6 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
}
define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %vb, i32 zeroext %evl) {
-; CHECK-LABEL: vfmax_vv_v32f64_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a1, a0, 128
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vle64.v v24, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: bltu a2, a1, .LBB25_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
-; CHECK-NEXT: .LBB25_2:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v16
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
-; CHECK-NEXT: vfmax.vv v16, v16, v24
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%v = call <32 x double> @llvm.vp.maximum.v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> splat (i1 true), i32 %evl)
ret <32 x double> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
index e179970199171..9a5304e0d94e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
@@ -611,10 +611,10 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: li a2, 24
@@ -657,75 +657,6 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
}
define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %vb, i32 zeroext %evl) {
-; CHECK-LABEL: vfmin_vv_v32f64_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a1, a0, 128
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vle64.v v24, (a0)
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: bltu a2, a1, .LBB25_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
-; CHECK-NEXT: .LBB25_2:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
-; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: vfmin.vv v8, v8, v16
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v0, v16, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vmfeq.vv v7, v8, v8
-; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0
-; CHECK-NEXT: vfmin.vv v16, v16, v24
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%v = call <32 x double> @llvm.vp.minimum.v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> splat (i1 true), i32 %evl)
ret <32 x double> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
index 465b166826a37..6d87ecfd3bc6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll
@@ -96,10 +96,10 @@ define <32 x double> @vfpext_v32f32_v32f64(<32 x float> %a, <32 x i1> %m, i32 ze
; CHECK-NEXT: .LBB7_2:
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
; CHECK-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 16
; CHECK-NEXT: vmv1r.v v0, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
index 96eda109e1c70..044b9fefa1220 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll
@@ -376,10 +376,10 @@ define <32 x i64> @vfptosi_v32i64_v32f64(<32 x double> %va, <32 x i1> %m, i32 ze
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v16, v0.t
@@ -399,10 +399,10 @@ define <32 x i64> @vfptosi_v32i64_v32f64_unmasked(<32 x double> %va, i32 zeroext
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
index 4020100bf364b..55f4d9e0805c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll
@@ -376,10 +376,10 @@ define <32 x i64> @vfptoui_v32i64_v32f64(<32 x double> %va, <32 x i1> %m, i32 ze
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.xu.f.v v16, v16, v0.t
@@ -399,10 +399,10 @@ define <32 x i64> @vfptoui_v32i64_v32f64_unmasked(<32 x double> %va, i32 zeroext
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.xu.f.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
index e509722b623a2..aab5bbdfebacd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
@@ -97,10 +97,10 @@ define <32 x float> @vfptrunc_v32f32_v32f64(<32 x double> %a, <32 x i1> %m, i32
; CHECK-NEXT: .LBB7_2:
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
; CHECK-NEXT: vfncvt.f.f.w v8, v24, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v12
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vfncvt.f.f.w v24, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
index 57c94830fc606..e3ed908a5bddb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -741,10 +741,10 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; RV32-NEXT: vfabs.v v24, v8, v0.t
; RV32-NEXT: lui a1, %hi(.LCPI26_0)
; RV32-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32-NEXT: addi a1, a0, -16
-; RV32-NEXT: sltu a0, a0, a1
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: sltiu a1, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a1, a1
+; RV32-NEXT: and a0, a1, a0
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32-NEXT: frflags a1
@@ -787,12 +787,12 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; RV64-NEXT: li a1, 1075
; RV64-NEXT: slli a1, a1, 52
; RV64-NEXT: fmv.d.x fa5, a1
-; RV64-NEXT: addi a1, a0, -16
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: sltiu a1, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: neg a1, a1
+; RV64-NEXT: and a0, a1, a0
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64-NEXT: and a0, a0, a1
; RV64-NEXT: frflags a1
; RV64-NEXT: vmv1r.v v0, v6
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -832,10 +832,10 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex
; RV32-NEXT: vfabs.v v24, v8
; RV32-NEXT: lui a2, %hi(.LCPI27_0)
; RV32-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: frflags a2
; RV32-NEXT: vmflt.vf v0, v24, fa5
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -870,10 +870,10 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex
; RV64-NEXT: li a2, 1075
; RV64-NEXT: slli a2, a2, 52
; RV64-NEXT: fmv.d.x fa5, a2
-; RV64-NEXT: addi a2, a0, -16
-; RV64-NEXT: sltu a0, a0, a2
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: sltiu a2, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: neg a2, a2
+; RV64-NEXT: and a0, a2, a0
; RV64-NEXT: frflags a2
; RV64-NEXT: vmflt.vf v0, v24, fa5
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
index ca9b24e60e503..4e90727b6ebf1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
@@ -123,10 +123,10 @@ define float @vpreduce_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32
; CHECK-NEXT: vfmv.s.f v25, fa0
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vfredusum.vs v25, v8, v25, v0.t
-; CHECK-NEXT: addi a1, a0, -32
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 33
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -32
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vfredusum.vs v25, v16, v25, v0.t
@@ -151,10 +151,10 @@ define float @vpreduce_ord_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m,
; CHECK-NEXT: vfmv.s.f v25, fa0
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t
-; CHECK-NEXT: addi a1, a0, -32
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 33
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -32
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vfredosum.vs v25, v16, v25, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index 3e77020ed0213..27211f153b526 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -654,12 +654,12 @@ define signext i32 @vpreduce_xor_v64i32(i32 signext %s, <64 x i32> %v, <64 x i1>
; CHECK-NEXT: .LBB49_2:
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vmv.s.x v25, a0
-; CHECK-NEXT: addi a0, a1, -32
+; CHECK-NEXT: sltiu a0, a1, 33
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t
-; CHECK-NEXT: sltu a1, a1, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a1, a1, -32
+; CHECK-NEXT: and a0, a0, a1
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vredxor.vs v25, v16, v25, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
index 8523ca957a8f5..b5cd2e783ff66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
@@ -211,15 +211,15 @@ define zeroext i1 @vpreduce_and_v256i1(i1 zeroext %s, <256 x i1> %v, <256 x i1>
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: .LBB14_2:
; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: sltiu a3, a1, 129
+; CHECK-NEXT: addi a1, a1, -128
+; CHECK-NEXT: neg a3, a3
+; CHECK-NEXT: and a1, a3, a1
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vmnot.m v9, v9
; CHECK-NEXT: vcpop.m a2, v9, v0.t
; CHECK-NEXT: seqz a2, a2
; CHECK-NEXT: and a0, a2, a0
-; CHECK-NEXT: addi a2, a1, -128
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmnot.m v8, v8
; CHECK-NEXT: vmv1r.v v0, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
index 7540495c0d3b5..41e8d1f982e32 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
@@ -669,12 +669,12 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
; RV32-NEXT: vfabs.v v24, v8, v0.t
; RV32-NEXT: lui a1, %hi(.LCPI26_0)
; RV32-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32-NEXT: addi a1, a0, -16
-; RV32-NEXT: sltu a0, a0, a1
-; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: sltiu a1, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a1, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: and a0, a1, a0
; RV32-NEXT: vmv1r.v v0, v6
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vfcvt.x.f.v v24, v8, v0.t
@@ -711,12 +711,12 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
; RV64-NEXT: li a1, 1075
; RV64-NEXT: slli a1, a1, 52
; RV64-NEXT: fmv.d.x fa5, a1
-; RV64-NEXT: addi a1, a0, -16
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: sltiu a1, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: neg a1, a1
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: and a0, a1, a0
; RV64-NEXT: vmv1r.v v0, v6
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT: vfcvt.x.f.v v24, v8, v0.t
@@ -752,10 +752,10 @@ define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; RV32-NEXT: vfabs.v v24, v8
; RV32-NEXT: lui a2, %hi(.LCPI27_0)
; RV32-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32-NEXT: addi a2, a0, -16
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: sltiu a2, a0, 17
+; RV32-NEXT: addi a0, a0, -16
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: vmflt.vf v0, v24, fa5
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vfabs.v v24, v16
@@ -786,11 +786,11 @@ define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
; RV64-NEXT: li a2, 1075
; RV64-NEXT: slli a2, a2, 52
; RV64-NEXT: fmv.d.x fa5, a2
-; RV64-NEXT: addi a2, a0, -16
-; RV64-NEXT: sltu a0, a0, a2
-; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: sltiu a2, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: neg a2, a2
+; RV64-NEXT: and a0, a2, a0
; RV64-NEXT: vmflt.vf v0, v24, fa5
-; RV64-NEXT: and a0, a0, a2
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vfabs.v v24, v16
; RV64-NEXT: vmflt.vf v7, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
index de5427f329496..2d4941744292e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT: addi a1, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a1
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a1
+; RV32ZVFH-NEXT: sltiu a1, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a1, a1
+; RV32ZVFH-NEXT: and a0, a1, a0
; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFH-NEXT: fsrmi a1, 4
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV64ZVFH-NEXT: li a1, 1075
; RV64ZVFH-NEXT: slli a1, a1, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a1
-; RV64ZVFH-NEXT: addi a1, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a1
-; RV64ZVFH-NEXT: addi a0, a0, -1
+; RV64ZVFH-NEXT: sltiu a1, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a1, a1
+; RV64ZVFH-NEXT: and a0, a1, a0
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT: and a0, a0, a1
; RV64ZVFH-NEXT: fsrmi a1, 4
; RV64ZVFH-NEXT: vmv1r.v v0, v6
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT: addi a1, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a1
+; RV32ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a1, a1
+; RV32ZVFHMIN-NEXT: and a0, a1, a0
; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFHMIN-NEXT: fsrmi a1, 4
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; RV64ZVFHMIN-NEXT: li a1, 1075
; RV64ZVFHMIN-NEXT: slli a1, a1, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT: addi a1, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
+; RV64ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a1, a1
+; RV64ZVFHMIN-NEXT: and a0, a1, a0
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT: and a0, a0, a1
; RV64ZVFHMIN-NEXT: fsrmi a1, 4
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV32ZVFH-NEXT: vfabs.v v24, v8
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT: addi a2, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a2
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a2
+; RV32ZVFH-NEXT: sltiu a2, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a2, a2
+; RV32ZVFH-NEXT: and a0, a2, a0
; RV32ZVFH-NEXT: fsrmi a2, 4
; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV64ZVFH-NEXT: li a2, 1075
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: addi a2, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a2
-; RV64ZVFH-NEXT: addi a0, a0, -1
-; RV64ZVFH-NEXT: and a0, a0, a2
+; RV64ZVFH-NEXT: sltiu a2, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a2, a2
+; RV64ZVFH-NEXT: and a0, a2, a0
; RV64ZVFH-NEXT: fsrmi a2, 4
; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT: addi a2, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a2
+; RV32ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a2, a2
+; RV32ZVFHMIN-NEXT: and a0, a2, a0
; RV32ZVFHMIN-NEXT: fsrmi a2, 4
; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; RV64ZVFHMIN-NEXT: li a2, 1075
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: addi a2, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
-; RV64ZVFHMIN-NEXT: and a0, a0, a2
+; RV64ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a2, a2
+; RV64ZVFHMIN-NEXT: and a0, a2, a0
; RV64ZVFHMIN-NEXT: fsrmi a2, 4
; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
index 1c923e3f12171..45ea933f427ae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT: addi a1, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a1
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a1
+; RV32ZVFH-NEXT: sltiu a1, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a1, a1
+; RV32ZVFH-NEXT: and a0, a1, a0
; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFH-NEXT: fsrmi a1, 0
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; RV64ZVFH-NEXT: li a1, 1075
; RV64ZVFH-NEXT: slli a1, a1, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a1
-; RV64ZVFH-NEXT: addi a1, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a1
-; RV64ZVFH-NEXT: addi a0, a0, -1
+; RV64ZVFH-NEXT: sltiu a1, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a1, a1
+; RV64ZVFH-NEXT: and a0, a1, a0
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT: and a0, a0, a1
; RV64ZVFH-NEXT: fsrmi a1, 0
; RV64ZVFH-NEXT: vmv1r.v v0, v6
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT: addi a1, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a1
+; RV32ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a1, a1
+; RV32ZVFHMIN-NEXT: and a0, a1, a0
; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFHMIN-NEXT: fsrmi a1, 0
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; RV64ZVFHMIN-NEXT: li a1, 1075
; RV64ZVFHMIN-NEXT: slli a1, a1, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT: addi a1, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
+; RV64ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a1, a1
+; RV64ZVFHMIN-NEXT: and a0, a1, a0
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT: and a0, a0, a1
; RV64ZVFHMIN-NEXT: fsrmi a1, 0
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
; RV32ZVFH-NEXT: vfabs.v v24, v8
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT: addi a2, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a2
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a2
+; RV32ZVFH-NEXT: sltiu a2, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a2, a2
+; RV32ZVFH-NEXT: and a0, a2, a0
; RV32ZVFH-NEXT: fsrmi a2, 0
; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
; RV64ZVFH-NEXT: li a2, 1075
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: addi a2, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a2
-; RV64ZVFH-NEXT: addi a0, a0, -1
-; RV64ZVFH-NEXT: and a0, a0, a2
+; RV64ZVFH-NEXT: sltiu a2, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a2, a2
+; RV64ZVFH-NEXT: and a0, a2, a0
; RV64ZVFH-NEXT: fsrmi a2, 0
; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT: addi a2, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a2
+; RV32ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a2, a2
+; RV32ZVFHMIN-NEXT: and a0, a2, a0
; RV32ZVFHMIN-NEXT: fsrmi a2, 0
; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
; RV64ZVFHMIN-NEXT: li a2, 1075
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: addi a2, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
-; RV64ZVFHMIN-NEXT: and a0, a0, a2
+; RV64ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a2, a2
+; RV64ZVFHMIN-NEXT: and a0, a2, a0
; RV64ZVFHMIN-NEXT: fsrmi a2, 0
; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
index 83cbd2b760341..3dc45f97e6964 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
@@ -1267,10 +1267,10 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFH-NEXT: addi a1, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a1
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a1
+; RV32ZVFH-NEXT: sltiu a1, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a1, a1
+; RV32ZVFH-NEXT: and a0, a1, a0
; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFH-NEXT: fsrmi a1, 1
@@ -1313,12 +1313,12 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
; RV64ZVFH-NEXT: li a1, 1075
; RV64ZVFH-NEXT: slli a1, a1, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a1
-; RV64ZVFH-NEXT: addi a1, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a1
-; RV64ZVFH-NEXT: addi a0, a0, -1
+; RV64ZVFH-NEXT: sltiu a1, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a1, a1
+; RV64ZVFH-NEXT: and a0, a1, a0
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFH-NEXT: and a0, a0, a1
; RV64ZVFH-NEXT: fsrmi a1, 1
; RV64ZVFH-NEXT: vmv1r.v v0, v6
; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1358,10 +1358,10 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t
; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1)
-; RV32ZVFHMIN-NEXT: addi a1, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a1
+; RV32ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a1, a1
+; RV32ZVFHMIN-NEXT: and a0, a1, a0
; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
; RV32ZVFHMIN-NEXT: fsrmi a1, 1
@@ -1404,12 +1404,12 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
; RV64ZVFHMIN-NEXT: li a1, 1075
; RV64ZVFHMIN-NEXT: slli a1, a1, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1
-; RV64ZVFHMIN-NEXT: addi a1, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a1
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
+; RV64ZVFHMIN-NEXT: sltiu a1, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a1, a1
+; RV64ZVFHMIN-NEXT: and a0, a1, a0
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu
; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t
-; RV64ZVFHMIN-NEXT: and a0, a0, a1
; RV64ZVFHMIN-NEXT: fsrmi a1, 1
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma
@@ -1449,10 +1449,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
; RV32ZVFH-NEXT: vfabs.v v24, v8
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFH-NEXT: addi a2, a0, -16
-; RV32ZVFH-NEXT: sltu a0, a0, a2
-; RV32ZVFH-NEXT: addi a0, a0, -1
-; RV32ZVFH-NEXT: and a0, a0, a2
+; RV32ZVFH-NEXT: sltiu a2, a0, 17
+; RV32ZVFH-NEXT: addi a0, a0, -16
+; RV32ZVFH-NEXT: neg a2, a2
+; RV32ZVFH-NEXT: and a0, a2, a0
; RV32ZVFH-NEXT: fsrmi a2, 1
; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1487,10 +1487,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
; RV64ZVFH-NEXT: li a2, 1075
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: addi a2, a0, -16
-; RV64ZVFH-NEXT: sltu a0, a0, a2
-; RV64ZVFH-NEXT: addi a0, a0, -1
-; RV64ZVFH-NEXT: and a0, a0, a2
+; RV64ZVFH-NEXT: sltiu a2, a0, 17
+; RV64ZVFH-NEXT: addi a0, a0, -16
+; RV64ZVFH-NEXT: neg a2, a2
+; RV64ZVFH-NEXT: and a0, a2, a0
; RV64ZVFH-NEXT: fsrmi a2, 1
; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1524,10 +1524,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
; RV32ZVFHMIN-NEXT: vfabs.v v24, v8
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0)
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2)
-; RV32ZVFHMIN-NEXT: addi a2, a0, -16
-; RV32ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV32ZVFHMIN-NEXT: addi a0, a0, -1
-; RV32ZVFHMIN-NEXT: and a0, a0, a2
+; RV32ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV32ZVFHMIN-NEXT: addi a0, a0, -16
+; RV32ZVFHMIN-NEXT: neg a2, a2
+; RV32ZVFHMIN-NEXT: and a0, a2, a0
; RV32ZVFHMIN-NEXT: fsrmi a2, 1
; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1562,10 +1562,10 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
; RV64ZVFHMIN-NEXT: li a2, 1075
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: addi a2, a0, -16
-; RV64ZVFHMIN-NEXT: sltu a0, a0, a2
-; RV64ZVFHMIN-NEXT: addi a0, a0, -1
-; RV64ZVFHMIN-NEXT: and a0, a0, a2
+; RV64ZVFHMIN-NEXT: sltiu a2, a0, 17
+; RV64ZVFHMIN-NEXT: addi a0, a0, -16
+; RV64ZVFHMIN-NEXT: neg a2, a2
+; RV64ZVFHMIN-NEXT: and a0, a2, a0
; RV64ZVFHMIN-NEXT: fsrmi a2, 1
; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5
; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index af3e9db9fe123..79f1b88a765b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1076,10 +1076,10 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
; ZVFH-NEXT: .LBB43_2:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vmfeq.vv v6, v8, v24, v0.t
-; ZVFH-NEXT: addi a0, a2, -64
-; ZVFH-NEXT: sltu a1, a2, a0
-; ZVFH-NEXT: addi a1, a1, -1
-; ZVFH-NEXT: and a0, a1, a0
+; ZVFH-NEXT: sltiu a0, a2, 65
+; ZVFH-NEXT: neg a0, a0
+; ZVFH-NEXT: addi a1, a2, -64
+; ZVFH-NEXT: and a0, a0, a1
; ZVFH-NEXT: vmv1r.v v0, v7
; ZVFH-NEXT: addi a1, sp, 16
; ZVFH-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -3929,10 +3929,10 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x
; CHECK-NEXT: .LBB87_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v6, v8, v24, v0.t
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
index efc0f7ef4a441..9f354d160d7c1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
@@ -598,13 +598,13 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1>
; CHECK-NEXT: addi a4, a0, 128
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a2)
-; CHECK-NEXT: addi a2, a3, -128
+; CHECK-NEXT: sltiu a2, a3, 129
; CHECK-NEXT: vle8.v v24, (a4)
-; CHECK-NEXT: sltu a4, a3, a2
+; CHECK-NEXT: addi a4, a3, -128
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a2, a4, a2
-; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: neg a0, a2
+; CHECK-NEXT: and a0, a0, a4
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vmseq.vv v6, v16, v24, v0.t
; CHECK-NEXT: bltu a3, a1, .LBB51_2
; CHECK-NEXT: # %bb.1:
@@ -636,10 +636,10 @@ define <256 x i1> @icmp_eq_vx_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 z
; CHECK-NEXT: li a3, 128
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a1)
-; CHECK-NEXT: addi a1, a2, -128
-; CHECK-NEXT: sltu a4, a2, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: sltiu a1, a2, 129
+; CHECK-NEXT: addi a4, a2, -128
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a1, a1, a4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t
; CHECK-NEXT: bltu a2, a3, .LBB52_2
@@ -666,10 +666,10 @@ define <256 x i1> @icmp_eq_vx_swap_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m,
; CHECK-NEXT: li a3, 128
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a1)
-; CHECK-NEXT: addi a1, a2, -128
-; CHECK-NEXT: sltu a4, a2, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: sltiu a1, a2, 129
+; CHECK-NEXT: addi a4, a2, -128
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a1, a1, a4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t
; CHECK-NEXT: bltu a2, a3, .LBB53_2
@@ -1250,10 +1250,10 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m
; CHECK-NEXT: .LBB99_2:
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vmseq.vv v6, v8, v24, v0.t
-; CHECK-NEXT: addi a0, a2, -32
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 33
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -32
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -1286,10 +1286,10 @@ define <64 x i1> @icmp_eq_vx_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 ze
; CHECK-NEXT: .LBB100_2:
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: vmseq.vx v25, v8, a0, v0.t
-; CHECK-NEXT: addi a2, a1, -32
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: sltiu a2, a1, 33
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: addi a1, a1, -32
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vmseq.vx v8, v16, a0, v0.t
@@ -1316,10 +1316,10 @@ define <64 x i1> @icmp_eq_vx_swap_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i
; CHECK-NEXT: .LBB101_2:
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: vmseq.vx v25, v8, a0, v0.t
-; CHECK-NEXT: addi a2, a1, -32
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: sltiu a2, a1, 33
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: addi a1, a1, -32
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vmseq.vx v8, v16, a0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
index a452e5a9ffbb8..9a08596ebb473 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll
@@ -147,10 +147,10 @@ define <32 x i64> @vsext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext
; CHECK-NEXT: .LBB12_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vsext.vf2 v16, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 16
; CHECK-NEXT: vmv1r.v v0, v24
@@ -174,10 +174,10 @@ define <32 x i64> @vsext_v32i64_v32i32_unmasked(<32 x i32> %va, i32 zeroext %evl
; CHECK-NEXT: .LBB13_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vsext.vf2 v24, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 16
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
index afa8f2fda2ed4..8202ba4e2d815 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll
@@ -372,10 +372,10 @@ define <32 x double> @vsitofp_v32f64_v32i64(<32 x i64> %va, <32 x i1> %m, i32 ze
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
@@ -395,10 +395,10 @@ define <32 x double> @vsitofp_v32f64_v32i64_unmasked(<32 x i64> %va, i32 zeroext
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.x.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.x.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 8af4ced77be39..45c106240efc9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -487,25 +487,24 @@ define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: li a4, 16
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: bltu a2, a4, .LBB45_2
+; CHECK-NEXT: sltiu a3, a2, 17
+; CHECK-NEXT: addi a4, a2, -16
+; CHECK-NEXT: neg a3, a3
+; CHECK-NEXT: li a5, 16
+; CHECK-NEXT: and a3, a3, a4
+; CHECK-NEXT: bltu a2, a5, .LBB45_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 16
+; CHECK-NEXT: li a2, 16
; CHECK-NEXT: .LBB45_2:
-; CHECK-NEXT: mul a4, a3, a1
-; CHECK-NEXT: addi a5, a2, -16
+; CHECK-NEXT: mul a4, a2, a1
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v9, 2
; CHECK-NEXT: add a4, a0, a4
-; CHECK-NEXT: sltu a2, a2, a5
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a5
; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vlse64.v v16, (a4), a1, v0.t
; CHECK-NEXT: vmv1r.v v0, v9
-; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
%load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> %m, i32 %evl)
@@ -515,21 +514,20 @@ define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x
define <32 x double> @strided_vpload_v32f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) nounwind {
; CHECK-LABEL: strided_vpload_v32f64_allones_mask:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a4, 16
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: bltu a2, a4, .LBB46_2
+; CHECK-NEXT: sltiu a3, a2, 17
+; CHECK-NEXT: addi a4, a2, -16
+; CHECK-NEXT: neg a3, a3
+; CHECK-NEXT: li a5, 16
+; CHECK-NEXT: and a3, a3, a4
+; CHECK-NEXT: bltu a2, a5, .LBB46_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 16
+; CHECK-NEXT: li a2, 16
; CHECK-NEXT: .LBB46_2:
-; CHECK-NEXT: mul a4, a3, a1
-; CHECK-NEXT: addi a5, a2, -16
+; CHECK-NEXT: mul a4, a2, a1
; CHECK-NEXT: add a4, a0, a4
-; CHECK-NEXT: sltu a2, a2, a5
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a5
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vlse64.v v16, (a4), a1
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vlse64.v v16, (a4), a1
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1
; CHECK-NEXT: ret
%load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> splat (i1 true), i32 %evl)
@@ -549,10 +547,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV32-NEXT: li a3, 32
; CHECK-RV32-NEXT: .LBB47_2:
; CHECK-RV32-NEXT: mul a6, a3, a2
-; CHECK-RV32-NEXT: addi a5, a4, -32
-; CHECK-RV32-NEXT: sltu a7, a4, a5
-; CHECK-RV32-NEXT: addi a7, a7, -1
-; CHECK-RV32-NEXT: and a7, a7, a5
+; CHECK-RV32-NEXT: sltiu a5, a4, 33
+; CHECK-RV32-NEXT: addi a7, a4, -32
+; CHECK-RV32-NEXT: neg a5, a5
+; CHECK-RV32-NEXT: and a7, a5, a7
; CHECK-RV32-NEXT: li a5, 16
; CHECK-RV32-NEXT: add a6, a1, a6
; CHECK-RV32-NEXT: bltu a7, a5, .LBB47_4
@@ -563,10 +561,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV32-NEXT: vslidedown.vi v0, v8, 4
; CHECK-RV32-NEXT: vsetvli zero, a7, e64, m8, ta, ma
; CHECK-RV32-NEXT: vlse64.v v16, (a6), a2, v0.t
-; CHECK-RV32-NEXT: addi a6, a3, -16
-; CHECK-RV32-NEXT: sltu a3, a3, a6
-; CHECK-RV32-NEXT: addi a3, a3, -1
-; CHECK-RV32-NEXT: and a3, a3, a6
+; CHECK-RV32-NEXT: sltiu a6, a3, 17
+; CHECK-RV32-NEXT: neg a6, a6
+; CHECK-RV32-NEXT: addi a3, a3, -16
+; CHECK-RV32-NEXT: and a3, a6, a3
; CHECK-RV32-NEXT: bltu a4, a5, .LBB47_6
; CHECK-RV32-NEXT: # %bb.5:
; CHECK-RV32-NEXT: li a4, 16
@@ -600,10 +598,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV64-NEXT: li a4, 32
; CHECK-RV64-NEXT: .LBB47_2:
; CHECK-RV64-NEXT: mul a6, a4, a2
-; CHECK-RV64-NEXT: addi a5, a3, -32
-; CHECK-RV64-NEXT: sltu a7, a3, a5
-; CHECK-RV64-NEXT: addi a7, a7, -1
-; CHECK-RV64-NEXT: and a7, a7, a5
+; CHECK-RV64-NEXT: sltiu a5, a3, 33
+; CHECK-RV64-NEXT: addi a7, a3, -32
+; CHECK-RV64-NEXT: neg a5, a5
+; CHECK-RV64-NEXT: and a7, a5, a7
; CHECK-RV64-NEXT: li a5, 16
; CHECK-RV64-NEXT: add a6, a1, a6
; CHECK-RV64-NEXT: bltu a7, a5, .LBB47_4
@@ -614,10 +612,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
; CHECK-RV64-NEXT: vslidedown.vi v0, v8, 4
; CHECK-RV64-NEXT: vsetvli zero, a7, e64, m8, ta, ma
; CHECK-RV64-NEXT: vlse64.v v16, (a6), a2, v0.t
-; CHECK-RV64-NEXT: addi a6, a4, -16
-; CHECK-RV64-NEXT: sltu a4, a4, a6
-; CHECK-RV64-NEXT: addi a4, a4, -1
-; CHECK-RV64-NEXT: and a4, a4, a6
+; CHECK-RV64-NEXT: sltiu a6, a4, 17
+; CHECK-RV64-NEXT: neg a6, a6
+; CHECK-RV64-NEXT: addi a4, a4, -16
+; CHECK-RV64-NEXT: and a4, a6, a4
; CHECK-RV64-NEXT: bltu a3, a5, .LBB47_6
; CHECK-RV64-NEXT: # %bb.5:
; CHECK-RV64-NEXT: li a3, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
index 25624ea0fcf6c..c7edae931a126 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
@@ -411,14 +411,14 @@ define void @strided_store_v32f64(<32 x double> %v, ptr %ptr, i32 signext %strid
; CHECK-NEXT: .LBB38_2:
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t
+; CHECK-NEXT: sltiu a4, a2, 17
+; CHECK-NEXT: addi a2, a2, -16
; CHECK-NEXT: mul a3, a3, a1
-; CHECK-NEXT: add a0, a0, a3
-; CHECK-NEXT: addi a3, a2, -16
-; CHECK-NEXT: sltu a2, a2, a3
-; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: neg a4, a4
+; CHECK-NEXT: and a2, a4, a2
+; CHECK-NEXT: add a0, a0, a3
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v16, (a0), a1, v0.t
; CHECK-NEXT: ret
@@ -437,12 +437,12 @@ define void @strided_store_v32f64_allones_mask(<32 x double> %v, ptr %ptr, i32 s
; CHECK-NEXT: .LBB39_2:
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a0), a1
+; CHECK-NEXT: sltiu a4, a2, 17
+; CHECK-NEXT: addi a2, a2, -16
; CHECK-NEXT: mul a3, a3, a1
+; CHECK-NEXT: neg a4, a4
+; CHECK-NEXT: and a2, a4, a2
; CHECK-NEXT: add a0, a0, a3
-; CHECK-NEXT: addi a3, a2, -16
-; CHECK-NEXT: sltu a2, a2, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v16, (a0), a1
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
index f992d1f8f7eee..f69a4ffde7910 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
@@ -56,10 +56,10 @@ define <128 x i7> @vtrunc_v128i7_v128i16(<128 x i16> %a, <128 x i1> %m, i32 zero
; CHECK-NEXT: .LBB4_2:
; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v24, 0, v0.t
-; CHECK-NEXT: addi a2, a0, -64
-; CHECK-NEXT: sltu a0, a0, a2
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: sltiu a2, a0, 65
+; CHECK-NEXT: addi a0, a0, -64
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: and a0, a2, a0
; CHECK-NEXT: vmv1r.v v0, v12
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
; CHECK-NEXT: vnsrl.wi v24, v16, 0, v0.t
@@ -214,79 +214,85 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; RV32-NEXT: vmv1r.v v7, v0
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
; RV32-NEXT: vslidedown.vi v5, v0, 8
; RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vi v4, v0, 4
-; RV32-NEXT: addi a2, a7, -64
-; RV32-NEXT: vslidedown.vi v3, v5, 4
-; RV32-NEXT: sltu a3, a7, a2
-; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: and a4, a3, a2
-; RV32-NEXT: addi a2, a4, -32
-; RV32-NEXT: sltu a3, a4, a2
-; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: and a3, a3, a2
+; RV32-NEXT: sltiu a2, a7, 65
+; RV32-NEXT: addi a3, a7, -64
+; RV32-NEXT: neg a4, a2
+; RV32-NEXT: and a4, a4, a3
+; RV32-NEXT: sltiu a2, a4, 33
+; RV32-NEXT: addi a3, a4, -32
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and t1, a2, a3
; RV32-NEXT: li a2, 16
-; RV32-NEXT: addi t0, a3, -16
-; RV32-NEXT: mv a5, a3
-; RV32-NEXT: bltu a3, a2, .LBB16_2
+; RV32-NEXT: vslidedown.vi v3, v5, 4
+; RV32-NEXT: mv a5, t1
+; RV32-NEXT: bltu t1, a2, .LBB16_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a5, 16
; RV32-NEXT: .LBB16_2:
-; RV32-NEXT: li t2, 64
-; RV32-NEXT: sltu t1, a3, t0
+; RV32-NEXT: li t0, 64
+; RV32-NEXT: sltiu a3, t1, 17
; RV32-NEXT: mv a6, a7
-; RV32-NEXT: bltu a7, t2, .LBB16_4
+; RV32-NEXT: bltu a7, t0, .LBB16_4
; RV32-NEXT: # %bb.3:
; RV32-NEXT: li a6, 64
; RV32-NEXT: .LBB16_4:
; RV32-NEXT: addi t2, a1, 128
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v6, v4, 2
-; RV32-NEXT: addi t6, a1, 512
-; RV32-NEXT: addi t5, a1, 640
+; RV32-NEXT: addi t5, a1, 512
+; RV32-NEXT: addi t4, a1, 640
; RV32-NEXT: vslidedown.vi v0, v3, 2
-; RV32-NEXT: addi t1, t1, -1
+; RV32-NEXT: neg t0, a3
+; RV32-NEXT: addi t1, t1, -16
; RV32-NEXT: addi t3, a1, 384
; RV32-NEXT: vslidedown.vi v2, v5, 2
; RV32-NEXT: li a3, 32
-; RV32-NEXT: addi t4, a6, -32
-; RV32-NEXT: sltu a6, a6, t4
-; RV32-NEXT: addi a6, a6, -1
-; RV32-NEXT: and a6, a6, t4
-; RV32-NEXT: addi t4, a6, -16
-; RV32-NEXT: sltu s0, a6, t4
-; RV32-NEXT: addi s0, s0, -1
+; RV32-NEXT: sltiu t6, a6, 33
+; RV32-NEXT: addi a6, a6, -32
+; RV32-NEXT: neg t6, t6
+; RV32-NEXT: and a6, t6, a6
+; RV32-NEXT: sltiu t6, a6, 17
+; RV32-NEXT: neg t6, t6
+; RV32-NEXT: addi s0, a6, -16
; RV32-NEXT: bltu a6, a2, .LBB16_6
; RV32-NEXT: # %bb.5:
; RV32-NEXT: li a6, 16
; RV32-NEXT: .LBB16_6:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vle64.v v8, (t6)
-; RV32-NEXT: csrr t6, vlenb
+; RV32-NEXT: vle64.v v8, (t5)
+; RV32-NEXT: csrr t5, vlenb
; RV32-NEXT: sw a0, 4(sp) # 4-byte Folded Spill
; RV32-NEXT: li a0, 56
-; RV32-NEXT: mul t6, t6, a0
+; RV32-NEXT: mul t5, t5, a0
; RV32-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT: add t6, sp, t6
-; RV32-NEXT: addi t6, t6, 16
-; RV32-NEXT: vs8r.v v8, (t6) # vscale x 64-byte Folded Spill
-; RV32-NEXT: vle64.v v8, (t5)
-; RV32-NEXT: vle64.v v16, (t2)
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 16
+; RV32-NEXT: vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vle64.v v16, (t4)
+; RV32-NEXT: vle64.v v8, (t2)
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: li t4, 40
+; RV32-NEXT: mul t2, t2, t4
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 16
+; RV32-NEXT: vs8r.v v8, (t2) # vscale x 64-byte Folded Spill
; RV32-NEXT: vle64.v v24, (a1)
; RV32-NEXT: csrr t2, vlenb
-; RV32-NEXT: li t5, 48
-; RV32-NEXT: mul t2, t2, t5
+; RV32-NEXT: li t4, 48
+; RV32-NEXT: mul t2, t2, t4
; RV32-NEXT: add t2, sp, t2
; RV32-NEXT: addi t2, t2, 16
; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
@@ -296,8 +302,8 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: add t2, sp, t2
; RV32-NEXT: addi t2, t2, 16
; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV32-NEXT: and t2, t1, t0
-; RV32-NEXT: and t1, s0, t4
+; RV32-NEXT: and t2, t0, t1
+; RV32-NEXT: and t1, t6, s0
; RV32-NEXT: addi a1, a1, 256
; RV32-NEXT: mv t0, a4
; RV32-NEXT: bltu a4, a3, .LBB16_8
@@ -305,45 +311,45 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: li t0, 32
; RV32-NEXT: .LBB16_8:
; RV32-NEXT: vsetvli zero, t2, e32, m4, ta, ma
-; RV32-NEXT: vnsrl.wi v24, v8, 0, v0.t
-; RV32-NEXT: csrr t2, vlenb
-; RV32-NEXT: li t3, 24
-; RV32-NEXT: mul t2, t2, t3
-; RV32-NEXT: add t2, sp, t2
-; RV32-NEXT: addi t2, t2, 16
-; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vnsrl.wi v24, v16, 0, v0.t
; RV32-NEXT: vmv1r.v v0, v3
; RV32-NEXT: csrr t2, vlenb
; RV32-NEXT: li t3, 56
; RV32-NEXT: mul t2, t2, t3
; RV32-NEXT: add t2, sp, t2
; RV32-NEXT: addi t2, t2, 16
-; RV32-NEXT: vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v8, (t2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a5, e32, m4, ta, ma
-; RV32-NEXT: vnsrl.wi v8, v24, 0, v0.t
+; RV32-NEXT: vnsrl.wi v16, v8, 0, v0.t
; RV32-NEXT: csrr a5, vlenb
; RV32-NEXT: slli a5, a5, 6
; RV32-NEXT: add a5, sp, a5
; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
; RV32-NEXT: vmv1r.v v0, v6
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: li t2, 40
+; RV32-NEXT: mul a5, a5, t2
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, t1, e32, m4, ta, ma
-; RV32-NEXT: vnsrl.wi v8, v16, 0, v0.t
+; RV32-NEXT: vnsrl.wi v16, v8, 0, v0.t
; RV32-NEXT: csrr a5, vlenb
; RV32-NEXT: slli a5, a5, 4
; RV32-NEXT: add a5, sp, a5
; RV32-NEXT: addi a5, a5, 16
-; RV32-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a5, t0, -16
-; RV32-NEXT: sltu t0, t0, a5
-; RV32-NEXT: addi t0, t0, -1
-; RV32-NEXT: and a5, t0, a5
+; RV32-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: sltiu a5, t0, 17
+; RV32-NEXT: addi t0, t0, -16
+; RV32-NEXT: neg a5, a5
+; RV32-NEXT: and a5, a5, t0
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a1)
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v30, v7, 2
+; RV32-NEXT: vslidedown.vi v28, v7, 2
; RV32-NEXT: vmv1r.v v0, v4
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li t0, 48
@@ -364,9 +370,15 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a5, e32, m4, ta, ma
-; RV32-NEXT: vnsrl.wi v16, v8, 0, v0.t
+; RV32-NEXT: vnsrl.wi v8, v16, 0, v0.t
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a5, 40
+; RV32-NEXT: mul a1, a1, a5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
; RV32-NEXT: bltu a4, a2, .LBB16_10
; RV32-NEXT: # %bb.9:
; RV32-NEXT: li a4, 16
@@ -375,32 +387,33 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a4, e32, m4, ta, ma
-; RV32-NEXT: vnsrl.wi v24, v8, 0, v0.t
+; RV32-NEXT: vnsrl.wi v16, v8, 0, v0.t
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a4, 48
; RV32-NEXT: mul a1, a1, a4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
; RV32-NEXT: mv a1, a7
; RV32-NEXT: bltu a7, a3, .LBB16_12
; RV32-NEXT: # %bb.11:
; RV32-NEXT: li a1, 32
; RV32-NEXT: .LBB16_12:
-; RV32-NEXT: vmv1r.v v0, v30
+; RV32-NEXT: vmv1r.v v0, v28
+; RV32-NEXT: vmv4r.v v8, v24
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
-; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: li a5, 40
+; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 24
+; RV32-NEXT: li a5, 40
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
@@ -417,7 +430,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV32-NEXT: addi a4, a1, -16
+; RV32-NEXT: sltiu a4, a1, 17
; RV32-NEXT: csrr a5, vlenb
; RV32-NEXT: li a6, 56
; RV32-NEXT: mul a5, a5, a6
@@ -438,7 +451,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: addi a5, a5, 16
; RV32-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload
; RV32-NEXT: csrr a5, vlenb
-; RV32-NEXT: li a6, 24
+; RV32-NEXT: li a6, 40
; RV32-NEXT: mul a5, a5, a6
; RV32-NEXT: add a5, sp, a5
; RV32-NEXT: addi a5, a5, 16
@@ -450,11 +463,12 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: add a5, sp, a5
; RV32-NEXT: addi a5, a5, 16
; RV32-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV32-NEXT: sltu a1, a1, a4
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a1, a1, a4
+; RV32-NEXT: neg a4, a4
+; RV32-NEXT: addi a1, a1, -16
+; RV32-NEXT: and a1, a4, a1
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 5
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
@@ -466,35 +480,34 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: .LBB16_14:
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 40
-; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, a7, e32, m4, ta, ma
-; RV32-NEXT: vnsrl.wi v24, v16, 0, v0.t
+; RV32-NEXT: vnsrl.wi v16, v24, 0, v0.t
; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vslideup.vi v24, v8, 16
-; RV32-NEXT: vse32.v v24, (a0)
-; RV32-NEXT: addi a1, a0, 256
+; RV32-NEXT: vslideup.vi v16, v8, 16
+; RV32-NEXT: vse32.v v16, (a0)
+; RV32-NEXT: addi a1, a0, 128
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 48
+; RV32-NEXT: li a3, 56
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vse32.v v8, (a1)
-; RV32-NEXT: addi a1, a0, 128
+; RV32-NEXT: addi a1, a0, 384
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 56
-; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: slli a2, a2, 6
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; RV32-NEXT: vse32.v v8, (a1)
-; RV32-NEXT: addi a0, a0, 384
+; RV32-NEXT: addi a0, a0, 256
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: li a2, 48
+; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
@@ -537,66 +550,66 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: vslidedown.vi v5, v0, 8
; RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64-NEXT: vslidedown.vi v4, v0, 4
-; RV64-NEXT: addi a2, a7, -64
-; RV64-NEXT: vslidedown.vi v3, v5, 4
-; RV64-NEXT: sltu a3, a7, a2
-; RV64-NEXT: addi a3, a3, -1
-; RV64-NEXT: and a4, a3, a2
-; RV64-NEXT: addi a2, a4, -32
-; RV64-NEXT: sltu a3, a4, a2
-; RV64-NEXT: addi a3, a3, -1
-; RV64-NEXT: and a3, a3, a2
+; RV64-NEXT: sltiu a2, a7, 65
+; RV64-NEXT: addi a3, a7, -64
+; RV64-NEXT: neg a4, a2
+; RV64-NEXT: and a4, a4, a3
+; RV64-NEXT: sltiu a2, a4, 33
+; RV64-NEXT: addi a3, a4, -32
+; RV64-NEXT: neg a2, a2
+; RV64-NEXT: and t1, a2, a3
; RV64-NEXT: li a2, 16
-; RV64-NEXT: addi t0, a3, -16
-; RV64-NEXT: mv a5, a3
-; RV64-NEXT: bltu a3, a2, .LBB16_2
+; RV64-NEXT: vslidedown.vi v3, v5, 4
+; RV64-NEXT: mv a5, t1
+; RV64-NEXT: bltu t1, a2, .LBB16_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a5, 16
; RV64-NEXT: .LBB16_2:
-; RV64-NEXT: li t2, 64
-; RV64-NEXT: sltu t1, a3, t0
+; RV64-NEXT: li t0, 64
+; RV64-NEXT: sltiu a3, t1, 17
; RV64-NEXT: mv a6, a7
-; RV64-NEXT: bltu a7, t2, .LBB16_4
+; RV64-NEXT: bltu a7, t0, .LBB16_4
; RV64-NEXT: # %bb.3:
; RV64-NEXT: li a6, 64
; RV64-NEXT: .LBB16_4:
; RV64-NEXT: addi t2, a1, 128
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v6, v4, 2
-; RV64-NEXT: addi t6, a1, 512
-; RV64-NEXT: addi t5, a1, 640
+; RV64-NEXT: addi t5, a1, 512
+; RV64-NEXT: addi t4, a1, 640
; RV64-NEXT: vslidedown.vi v0, v3, 2
-; RV64-NEXT: addi t1, t1, -1
+; RV64-NEXT: neg t0, a3
+; RV64-NEXT: addi t1, t1, -16
; RV64-NEXT: addi t3, a1, 384
; RV64-NEXT: vslidedown.vi v2, v5, 2
; RV64-NEXT: li a3, 32
-; RV64-NEXT: addi t4, a6, -32
-; RV64-NEXT: sltu a6, a6, t4
-; RV64-NEXT: addi a6, a6, -1
-; RV64-NEXT: and a6, a6, t4
-; RV64-NEXT: addi t4, a6, -16
-; RV64-NEXT: sltu s0, a6, t4
-; RV64-NEXT: addi s0, s0, -1
+; RV64-NEXT: sltiu t6, a6, 33
+; RV64-NEXT: addi a6, a6, -32
+; RV64-NEXT: neg t6, t6
+; RV64-NEXT: and a6, t6, a6
+; RV64-NEXT: sltiu t6, a6, 17
+; RV64-NEXT: neg t6, t6
+; RV64-NEXT: addi s0, a6, -16
; RV64-NEXT: bltu a6, a2, .LBB16_6
; RV64-NEXT: # %bb.5:
; RV64-NEXT: li a6, 16
; RV64-NEXT: .LBB16_6:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vle64.v v8, (t6)
-; RV64-NEXT: csrr t6, vlenb
+; RV64-NEXT: vle64.v v8, (t5)
+; RV64-NEXT: csrr t5, vlenb
; RV64-NEXT: sd a0, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: li a0, 56
-; RV64-NEXT: mul t6, t6, a0
+; RV64-NEXT: mul t5, t5, a0
; RV64-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: add t6, sp, t6
-; RV64-NEXT: addi t6, t6, 32
-; RV64-NEXT: vs8r.v v8, (t6) # vscale x 64-byte Folded Spill
-; RV64-NEXT: vle64.v v8, (t5)
+; RV64-NEXT: add t5, sp, t5
+; RV64-NEXT: addi t5, t5, 32
+; RV64-NEXT: vs8r.v v8, (t5) # vscale x 64-byte Folded Spill
+; RV64-NEXT: vle64.v v8, (t4)
; RV64-NEXT: vle64.v v16, (t2)
; RV64-NEXT: vle64.v v24, (a1)
; RV64-NEXT: csrr t2, vlenb
-; RV64-NEXT: li t5, 48
-; RV64-NEXT: mul t2, t2, t5
+; RV64-NEXT: li t4, 48
+; RV64-NEXT: mul t2, t2, t4
; RV64-NEXT: add t2, sp, t2
; RV64-NEXT: addi t2, t2, 32
; RV64-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
@@ -606,8 +619,8 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: add t2, sp, t2
; RV64-NEXT: addi t2, t2, 32
; RV64-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: and t2, t1, t0
-; RV64-NEXT: and t1, s0, t4
+; RV64-NEXT: and t2, t0, t1
+; RV64-NEXT: and t1, t6, s0
; RV64-NEXT: addi a1, a1, 256
; RV64-NEXT: mv t0, a4
; RV64-NEXT: bltu a4, a3, .LBB16_8
@@ -644,10 +657,10 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: add a5, sp, a5
; RV64-NEXT: addi a5, a5, 32
; RV64-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT: addi a5, t0, -16
-; RV64-NEXT: sltu t0, t0, a5
-; RV64-NEXT: addi t0, t0, -1
-; RV64-NEXT: and a5, t0, a5
+; RV64-NEXT: sltiu a5, t0, 17
+; RV64-NEXT: addi t0, t0, -16
+; RV64-NEXT: neg a5, a5
+; RV64-NEXT: and a5, a5, t0
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a1)
; RV64-NEXT: addi a1, sp, 32
@@ -727,7 +740,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: add a4, sp, a4
; RV64-NEXT: addi a4, a4, 32
; RV64-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; RV64-NEXT: addi a4, a1, -16
+; RV64-NEXT: sltiu a4, a1, 17
; RV64-NEXT: csrr a5, vlenb
; RV64-NEXT: li a6, 56
; RV64-NEXT: mul a5, a5, a6
@@ -760,9 +773,9 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: add a5, sp, a5
; RV64-NEXT: addi a5, a5, 32
; RV64-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill
-; RV64-NEXT: sltu a1, a1, a4
-; RV64-NEXT: addi a1, a1, -1
-; RV64-NEXT: and a1, a1, a4
+; RV64-NEXT: neg a4, a4
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: and a1, a4, a1
; RV64-NEXT: csrr a4, vlenb
; RV64-NEXT: slli a4, a4, 5
; RV64-NEXT: add a4, sp, a4
@@ -786,17 +799,17 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; RV64-NEXT: vslideup.vi v24, v8, 16
; RV64-NEXT: vse32.v v24, (a0)
-; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: addi a1, a0, 128
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 48
+; RV64-NEXT: li a3, 56
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 32
; RV64-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; RV64-NEXT: vse32.v v8, (a1)
-; RV64-NEXT: addi a1, a0, 128
+; RV64-NEXT: addi a1, a0, 256
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 56
+; RV64-NEXT: li a3, 48
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 32
@@ -837,10 +850,10 @@ define <32 x i32> @vtrunc_v32i32_v32i64(<32 x i64> %a, <32 x i1> %m, i32 zeroext
; CHECK-NEXT: .LBB17_2:
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v24, 0, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v12
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vnsrl.wi v24, v16, 0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
index 3d1febe95421f..cde3f21947824 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll
@@ -372,10 +372,10 @@ define <32 x double> @vuitofp_v32f64_v32i64(<32 x i64> %va, <32 x i1> %m, i32 ze
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.xu.v v16, v16, v0.t
@@ -395,10 +395,10 @@ define <32 x double> @vuitofp_v32f64_v32i64_unmasked(<32 x i64> %va, i32 zeroext
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.xu.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfcvt.f.xu.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
index 96dff2464e501..3fc3b47113a32 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
@@ -354,10 +354,10 @@ define <256 x i8> @vadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %ev
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
-; CHECK-NEXT: addi a0, a1, -128
-; CHECK-NEXT: sltu a3, a1, a0
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a0, a3, a0
+; CHECK-NEXT: sltiu a0, a1, 129
+; CHECK-NEXT: addi a3, a1, -128
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: and a0, a0, a3
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t
; CHECK-NEXT: bltu a1, a2, .LBB32_2
@@ -383,10 +383,10 @@ define <256 x i8> @vadd_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
; CHECK-NEXT: .LBB33_2:
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vadd.vi v8, v8, -1
-; CHECK-NEXT: addi a1, a0, -128
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 129
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -128
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vadd.vi v16, v16, -1
; CHECK-NEXT: ret
@@ -1328,10 +1328,10 @@ define <32 x i64> @vadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; CHECK-NEXT: .LBB108_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t
@@ -1351,10 +1351,10 @@ define <32 x i64> @vadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; CHECK-NEXT: .LBB109_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vadd.vi v8, v8, -1
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vadd.vi v16, v16, -1
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
index da26c63b61e34..f2e051ee41ccb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
@@ -453,10 +453,10 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: .LBB34_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfsgnj.vv v8, v8, v24, v0.t
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -488,10 +488,10 @@ define <32 x double> @vfsgnj_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfsgnj.vv v8, v8, v0
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfsgnj.vv v16, v16, v24
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
index 2774aba974a29..12c7009e43a44 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll
@@ -621,10 +621,10 @@ define <32 x double> @vfabs_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; CHECK-NEXT: .LBB34_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v16, v0.t
@@ -644,10 +644,10 @@ define <32 x double> @vfabs_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
index f28b970f48ff7..e863e141376e9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
@@ -855,10 +855,10 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, a4, -16
-; CHECK-NEXT: sltu a1, a4, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a4, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a4, a4, -16
+; CHECK-NEXT: and a0, a0, a4
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: li a2, 24
@@ -898,27 +898,21 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
; CHECK-NEXT: addi a1, a2, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a2)
; CHECK-NEXT: addi a2, a0, 128
-; CHECK-NEXT: vle64.v v8, (a1)
+; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
; CHECK-NEXT: vle64.v v24, (a2)
; CHECK-NEXT: vle64.v v0, (a0)
; CHECK-NEXT: li a1, 16
@@ -927,31 +921,25 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a0, 16
; CHECK-NEXT: .LBB51_2:
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmadd.vv v0, v8, v16
-; CHECK-NEXT: addi a0, a4, -16
-; CHECK-NEXT: sltu a1, a4, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a4, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a4, a4, -16
+; CHECK-NEXT: and a0, a0, a4
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v24, v16, v8
+; CHECK-NEXT: vfmadd.vv v24, v8, v16
; CHECK-NEXT: vmv8r.v v8, v0
; CHECK-NEXT: vmv.v.v v16, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
index 403d0b8d57940..484389e29bed9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
@@ -381,10 +381,10 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmax.vv v8, v8, v24, v0.t
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -416,10 +416,10 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: .LBB27_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmax.vv v8, v8, v0
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmax.vv v16, v16, v24
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
index 56f7a8d48c5a1..92564e229bccc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
@@ -381,10 +381,10 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmin.vv v8, v8, v24, v0.t
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
@@ -416,10 +416,10 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: .LBB27_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmin.vv v8, v8, v0
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmin.vv v16, v16, v24
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
index a9857880b5942..5298b186f2d25 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
@@ -627,10 +627,10 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, a4, -16
-; CHECK-NEXT: sltu a1, a4, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a4, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a4, a4, -16
+; CHECK-NEXT: and a0, a0, a4
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: li a2, 24
@@ -670,27 +670,21 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
-; CHECK-NEXT: mul a1, a1, a3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
; CHECK-NEXT: addi a1, a2, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a2)
; CHECK-NEXT: addi a2, a0, 128
-; CHECK-NEXT: vle64.v v8, (a1)
+; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
; CHECK-NEXT: vle64.v v24, (a2)
; CHECK-NEXT: vle64.v v0, (a0)
; CHECK-NEXT: li a1, 16
@@ -699,31 +693,25 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a0, 16
; CHECK-NEXT: .LBB51_2:
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfmadd.vv v0, v8, v16
-; CHECK-NEXT: addi a0, a4, -16
-; CHECK-NEXT: sltu a1, a4, a0
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: sltiu a0, a4, 17
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: addi a4, a4, -16
+; CHECK-NEXT: and a0, a0, a4
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v24, v16, v8
+; CHECK-NEXT: vfmadd.vv v24, v8, v16
; CHECK-NEXT: vmv8r.v v8, v0
; CHECK-NEXT: vmv.v.v v16, v24
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
index 84a89b23bc3b5..2b09bd9a22b1f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll
@@ -589,10 +589,10 @@ define <32 x double> @vfneg_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
; CHECK-NEXT: .LBB34_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfneg.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfneg.v v16, v16, v0.t
@@ -612,10 +612,10 @@ define <32 x double> @vfneg_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfneg.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfneg.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
index b431d4873fa1b..9f72f786591a5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll
@@ -361,10 +361,10 @@ define <32 x double> @vfsqrt_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zero
; CHECK-NEXT: .LBB26_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfsqrt.v v8, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfsqrt.v v16, v16, v0.t
@@ -384,10 +384,10 @@ define <32 x double> @vfsqrt_vv_v32f64_unmasked(<32 x double> %va, i32 zeroext %
; CHECK-NEXT: .LBB27_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vfsqrt.v v8, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vfsqrt.v v16, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
index f5978de080082..aa7c3d5e113d6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
@@ -258,10 +258,10 @@ define <256 x i8> @vmax_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero
; CHECK-NEXT: li a3, 128
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a1)
-; CHECK-NEXT: addi a1, a2, -128
-; CHECK-NEXT: sltu a4, a2, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: sltiu a1, a2, 129
+; CHECK-NEXT: addi a4, a2, -128
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a1, a1, a4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmax.vx v16, v16, a0, v0.t
; CHECK-NEXT: bltu a2, a3, .LBB22_2
@@ -289,10 +289,10 @@ define <256 x i8> @vmax_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %e
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vmax.vx v8, v8, a0
-; CHECK-NEXT: addi a2, a1, -128
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: sltiu a2, a1, 129
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: addi a1, a1, -128
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmax.vx v16, v16, a0
; CHECK-NEXT: ret
@@ -1001,10 +1001,10 @@ define <32 x i64> @vmax_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmax.vx v8, v8, a2, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmax.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
index 7450a70df66ba..3d6dc76d5e70d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
@@ -257,10 +257,10 @@ define <256 x i8> @vmaxu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer
; CHECK-NEXT: li a3, 128
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a1)
-; CHECK-NEXT: addi a1, a2, -128
-; CHECK-NEXT: sltu a4, a2, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: sltiu a1, a2, 129
+; CHECK-NEXT: addi a4, a2, -128
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a1, a1, a4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmaxu.vx v16, v16, a0, v0.t
; CHECK-NEXT: bltu a2, a3, .LBB22_2
@@ -288,10 +288,10 @@ define <256 x i8> @vmaxu_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vmaxu.vx v8, v8, a0
-; CHECK-NEXT: addi a2, a1, -128
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: sltiu a2, a1, 129
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: addi a1, a1, -128
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmaxu.vx v16, v16, a0
; CHECK-NEXT: ret
@@ -1000,10 +1000,10 @@ define <32 x i64> @vmaxu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmaxu.vx v8, v8, a2, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmaxu.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
index 31d19304c2909..5000bea58fa36 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
@@ -258,10 +258,10 @@ define <256 x i8> @vmin_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero
; CHECK-NEXT: li a3, 128
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a1)
-; CHECK-NEXT: addi a1, a2, -128
-; CHECK-NEXT: sltu a4, a2, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: sltiu a1, a2, 129
+; CHECK-NEXT: addi a4, a2, -128
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a1, a1, a4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmin.vx v16, v16, a0, v0.t
; CHECK-NEXT: bltu a2, a3, .LBB22_2
@@ -289,10 +289,10 @@ define <256 x i8> @vmin_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %e
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vmin.vx v8, v8, a0
-; CHECK-NEXT: addi a2, a1, -128
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: sltiu a2, a1, 129
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: addi a1, a1, -128
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmin.vx v16, v16, a0
; CHECK-NEXT: ret
@@ -1001,10 +1001,10 @@ define <32 x i64> @vmin_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vmin.vx v8, v8, a2, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmin.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
index dda69ec8a7d2e..42b05a295e50e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
@@ -257,10 +257,10 @@ define <256 x i8> @vminu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer
; CHECK-NEXT: li a3, 128
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a1)
-; CHECK-NEXT: addi a1, a2, -128
-; CHECK-NEXT: sltu a4, a2, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a1, a4, a1
+; CHECK-NEXT: sltiu a1, a2, 129
+; CHECK-NEXT: addi a4, a2, -128
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a1, a1, a4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vminu.vx v16, v16, a0, v0.t
; CHECK-NEXT: bltu a2, a3, .LBB22_2
@@ -288,10 +288,10 @@ define <256 x i8> @vminu_vx_v258i8_unmasked(<256 x i8> %va, i8 %b, i32 zeroext %
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vminu.vx v8, v8, a0
-; CHECK-NEXT: addi a2, a1, -128
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: sltiu a2, a1, 129
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: addi a1, a1, -128
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vminu.vx v16, v16, a0
; CHECK-NEXT: ret
@@ -1000,10 +1000,10 @@ define <32 x i64> @vminu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vminu.vx v8, v8, a2, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vminu.vx v16, v16, a2, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
index 3f5751aaa2cad..071a726604787 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
@@ -285,16 +285,16 @@ define <32 x i8> @vpgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %
; RV64-NEXT: vsext.vf8 v16, v8
; RV64-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; RV64-NEXT: vluxei64.v v10, (a0), v16, v0.t
-; RV64-NEXT: addi a2, a1, -16
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma
; RV64-NEXT: vslidedown.vi v8, v8, 16
-; RV64-NEXT: sltu a1, a1, a2
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vsext.vf8 v16, v8
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: addi a1, a1, -1
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: li a0, 32
@@ -1997,12 +1997,12 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex
; RV32-NEXT: .LBB94_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v24, (zero), v8, v0.t
-; RV32-NEXT: addi a1, a0, -16
+; RV32-NEXT: sltiu a1, a0, 17
+; RV32-NEXT: addi a0, a0, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a0, a0, a1
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: neg a1, a1
+; RV32-NEXT: and a0, a1, a0
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v8, v8, 16
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -2020,12 +2020,12 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex
; RV64-NEXT: .LBB94_2:
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (zero), v8, v0.t
-; RV64-NEXT: addi a1, a0, -16
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: sltiu a1, a0, 17
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: neg a1, a1
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: and a0, a1, a0
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (zero), v16, v0.t
; RV64-NEXT: ret
@@ -2048,12 +2048,12 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
; RV32-NEXT: .LBB95_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT: addi a2, a1, -16
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -2077,12 +2077,12 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
; RV64-NEXT: .LBB95_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2106,12 +2106,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
; RV32-NEXT: .LBB96_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT: addi a2, a1, -16
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -2136,12 +2136,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
; RV64-NEXT: .LBB96_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2168,12 +2168,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
; RV32-NEXT: vluxei16.v v8, (a0), v16, v0.t
; RV32-NEXT: vsetivli zero, 16, e16, m4, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
-; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
+; RV32-NEXT: neg a2, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei16.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2194,12 +2194,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
; RV64-NEXT: vluxei16.v v8, (a0), v16, v0.t
; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma
; RV64-NEXT: vslidedown.vi v24, v16, 16
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei16.v v16, (a0), v24, v0.t
; RV64-NEXT: ret
@@ -2226,12 +2226,12 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
-; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
+; RV32-NEXT: neg a2, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2253,12 +2253,12 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
; RV64-NEXT: .LBB98_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2284,12 +2284,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
-; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
+; RV32-NEXT: neg a2, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2312,12 +2312,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
; RV64-NEXT: .LBB99_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2344,12 +2344,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
-; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
+; RV32-NEXT: neg a2, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2370,12 +2370,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
; RV64-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV64-NEXT: vslidedown.vi v24, v16, 16
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV64-NEXT: ret
@@ -2399,12 +2399,12 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
; RV32-NEXT: .LBB101_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT: addi a2, a1, -16
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -2427,12 +2427,12 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
; RV64-NEXT: .LBB101_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2455,12 +2455,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
; RV32-NEXT: .LBB102_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT: addi a2, a1, -16
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -2483,12 +2483,12 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
; RV64-NEXT: .LBB102_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2512,12 +2512,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
; RV32-NEXT: .LBB103_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
-; RV32-NEXT: addi a2, a1, -16
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: neg a2, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -2540,12 +2540,12 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
; RV64-NEXT: .LBB103_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2575,12 +2575,12 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v24, v16, 16
-; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a1, a1, a2
-; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: sltiu a2, a1, 17
+; RV32-NEXT: addi a1, a1, -16
+; RV32-NEXT: neg a2, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2598,12 +2598,12 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
; RV64-NEXT: .LBB104_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: sltu a1, a1, a2
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a2, a1, 17
+; RV64-NEXT: addi a1, a1, -16
+; RV64-NEXT: neg a2, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
; RV64-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
index d058669c103f3..8e50dffcaf31c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
@@ -325,12 +325,12 @@ define <32 x double> @vpload_v32f64(ptr %ptr, <32 x i1> %m, i32 zeroext %evl) {
; CHECK-NEXT: .LBB31_2:
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a0), v0.t
-; CHECK-NEXT: addi a2, a1, -16
+; CHECK-NEXT: sltiu a2, a1, 17
+; CHECK-NEXT: addi a1, a1, -16
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a0), v0.t
@@ -352,15 +352,15 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) {
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: .LBB32_2:
-; CHECK-NEXT: addi a5, a3, -16
+; CHECK-NEXT: sltiu a5, a3, 17
+; CHECK-NEXT: addi a3, a3, -16
; CHECK-NEXT: addi a4, a1, 128
-; CHECK-NEXT: addi a7, a2, -32
-; CHECK-NEXT: sltu a3, a3, a5
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a6, a3, a5
-; CHECK-NEXT: sltu a3, a2, a7
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a5, a3, a7
+; CHECK-NEXT: sltiu a7, a2, 33
+; CHECK-NEXT: neg a5, a5
+; CHECK-NEXT: and a6, a5, a3
+; CHECK-NEXT: addi a3, a2, -32
+; CHECK-NEXT: neg a5, a7
+; CHECK-NEXT: and a5, a5, a3
; CHECK-NEXT: li a3, 16
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v8, 2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
index 0bacb5c26cb4a..3a36cda6dd04a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
@@ -1306,12 +1306,12 @@ define <32 x double> @vpmerge_vv_v32f64(<32 x double> %va, <32 x double> %vb, <3
; CHECK-NEXT: .LBB83_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: neg a0, a0
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma
@@ -1339,12 +1339,12 @@ define <32 x double> @vpmerge_vf_v32f64(double %a, <32 x double> %vb, <32 x i1>
; CHECK-NEXT: .LBB84_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma
; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma
; CHECK-NEXT: vfmerge.vfm v16, v16, fa0, v0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
index b4d20d93f2a1c..e509b390a3067 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
@@ -1703,12 +1703,12 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
; RV32-NEXT: .LBB83_2:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (zero), v24, v0.t
-; RV32-NEXT: addi a0, a1, -16
+; RV32-NEXT: sltiu a0, a1, 17
+; RV32-NEXT: addi a1, a1, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a1, a1, a0
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a0, a1, a0
+; RV32-NEXT: neg a0, a0
+; RV32-NEXT: and a0, a0, a1
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v8, v24, 16
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1737,12 +1737,12 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
; RV64-NEXT: .LBB83_2:
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t
-; RV64-NEXT: addi a0, a2, -16
-; RV64-NEXT: sltu a1, a2, a0
-; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: sltiu a0, a2, 17
+; RV64-NEXT: addi a2, a2, -16
+; RV64-NEXT: neg a0, a0
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a0, a1, a0
+; RV64-NEXT: and a0, a0, a2
; RV64-NEXT: addi a1, sp, 16
; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1773,12 +1773,12 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
; RV32-NEXT: .LBB84_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
-; RV32-NEXT: addi a1, a2, -16
+; RV32-NEXT: sltiu a1, a2, 17
+; RV32-NEXT: addi a2, a2, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a2, a2, a1
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: neg a1, a1
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v8, v24, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -1819,12 +1819,12 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
; RV64-NEXT: vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t
-; RV64-NEXT: addi a1, a2, -16
-; RV64-NEXT: sltu a2, a2, a1
-; RV64-NEXT: addi a2, a2, -1
+; RV64-NEXT: sltiu a1, a2, 17
+; RV64-NEXT: addi a2, a2, -16
+; RV64-NEXT: neg a1, a1
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a2, a1
+; RV64-NEXT: and a1, a1, a2
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a2, sp, a2
@@ -1859,12 +1859,12 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
; RV32-NEXT: .LBB85_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
-; RV32-NEXT: addi a1, a2, -16
+; RV32-NEXT: sltiu a1, a2, 17
+; RV32-NEXT: addi a2, a2, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a2, a2, a1
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: neg a1, a1
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v8, v24, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -1905,12 +1905,12 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
; RV64-NEXT: vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t
-; RV64-NEXT: addi a1, a2, -16
-; RV64-NEXT: sltu a2, a2, a1
-; RV64-NEXT: addi a2, a2, -1
+; RV64-NEXT: sltiu a1, a2, 17
+; RV64-NEXT: addi a2, a2, -16
+; RV64-NEXT: neg a1, a1
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a2, a1
+; RV64-NEXT: and a1, a1, a2
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a2, sp, a2
@@ -1946,12 +1946,12 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
; RV32-NEXT: .LBB86_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
-; RV32-NEXT: addi a1, a2, -16
+; RV32-NEXT: sltiu a1, a2, 17
+; RV32-NEXT: addi a2, a2, -16
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: sltu a2, a2, a1
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: neg a1, a1
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT: vslidedown.vi v8, v24, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
@@ -1992,12 +1992,12 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
; RV64-NEXT: vl8r.v v24, (a3) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t
-; RV64-NEXT: addi a1, a2, -16
-; RV64-NEXT: sltu a2, a2, a1
-; RV64-NEXT: addi a2, a2, -1
+; RV64-NEXT: sltiu a1, a2, 17
+; RV64-NEXT: addi a2, a2, -16
+; RV64-NEXT: neg a1, a1
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
-; RV64-NEXT: and a1, a2, a1
+; RV64-NEXT: and a1, a1, a2
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a2, sp, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
index 855a87d21b7dc..b4e402caf5ba4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
@@ -255,12 +255,12 @@ define void @vpstore_v32f64(<32 x double> %val, ptr %ptr, <32 x i1> %m, i32 zero
; CHECK-NEXT: .LBB24_2:
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a0), v0.t
-; CHECK-NEXT: addi a2, a1, -16
+; CHECK-NEXT: sltiu a2, a1, 17
+; CHECK-NEXT: addi a1, a1, -16
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: and a1, a2, a1
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v16, (a0), v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
index acaa1e6fa002d..495049e51fb64 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
@@ -363,10 +363,10 @@ define <256 x i8> @vsadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
-; CHECK-NEXT: addi a0, a1, -128
-; CHECK-NEXT: sltu a3, a1, a0
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a0, a3, a0
+; CHECK-NEXT: sltiu a0, a1, 129
+; CHECK-NEXT: addi a3, a1, -128
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: and a0, a0, a3
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t
; CHECK-NEXT: bltu a1, a2, .LBB32_2
@@ -392,10 +392,10 @@ define <256 x i8> @vsadd_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
; CHECK-NEXT: .LBB33_2:
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vsadd.vi v8, v8, -1
-; CHECK-NEXT: addi a1, a0, -128
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 129
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -128
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vsadd.vi v16, v16, -1
; CHECK-NEXT: ret
@@ -1335,10 +1335,10 @@ define <32 x i64> @vsadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; CHECK-NEXT: .LBB108_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t
@@ -1358,10 +1358,10 @@ define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; CHECK-NEXT: .LBB109_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vsadd.vi v8, v8, -1
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vsadd.vi v16, v16, -1
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
index 9b3b8348d9b30..a5f57c24aaaaf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
@@ -359,10 +359,10 @@ define <256 x i8> @vsaddu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
-; CHECK-NEXT: addi a0, a1, -128
-; CHECK-NEXT: sltu a3, a1, a0
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a0, a3, a0
+; CHECK-NEXT: sltiu a0, a1, 129
+; CHECK-NEXT: addi a3, a1, -128
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: and a0, a0, a3
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t
; CHECK-NEXT: bltu a1, a2, .LBB32_2
@@ -388,10 +388,10 @@ define <256 x i8> @vsaddu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
; CHECK-NEXT: .LBB33_2:
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v8, v8, -1
-; CHECK-NEXT: addi a1, a0, -128
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 129
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -128
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v16, v16, -1
; CHECK-NEXT: ret
@@ -1331,10 +1331,10 @@ define <32 x i64> @vsaddu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %e
; CHECK-NEXT: .LBB108_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t
@@ -1354,10 +1354,10 @@ define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; CHECK-NEXT: .LBB109_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v8, v8, -1
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vsaddu.vi v16, v16, -1
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
index f2f9f90f386c0..e91477a622b1f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -143,15 +143,15 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3
; CHECK-NEXT: vmv1r.v v6, v8
; CHECK-NEXT: vmv1r.v v7, v0
; CHECK-NEXT: li a2, 128
-; CHECK-NEXT: addi a4, a1, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vle8.v v24, (a0)
-; CHECK-NEXT: addi a0, a3, -128
-; CHECK-NEXT: vle8.v v8, (a4)
-; CHECK-NEXT: sltu a4, a3, a0
+; CHECK-NEXT: addi a0, a1, 128
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: sltiu a0, a3, 129
+; CHECK-NEXT: neg a0, a0
; CHECK-NEXT: vle8.v v16, (a1)
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a0, a4, a0
+; CHECK-NEXT: addi a1, a3, -128
+; CHECK-NEXT: and a0, a0, a1
; CHECK-NEXT: vmv1r.v v0, v6
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0
@@ -342,12 +342,12 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: addi a0, a2, -16
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: sltiu a0, a2, 17
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: neg a0, a0
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 2
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -511,12 +511,12 @@ define <64 x float> @select_v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> %
; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT: addi a0, a2, -32
-; CHECK-NEXT: sltu a1, a2, a0
-; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: sltiu a0, a2, 33
+; CHECK-NEXT: addi a2, a2, -32
+; CHECK-NEXT: neg a0, a0
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v0, 4
-; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
index 4c7d312e8e785..0947e39ce87e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
@@ -373,12 +373,12 @@ define <256 x i8> @vssub_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: li a2, 128
-; CHECK-NEXT: addi a3, a1, -128
+; CHECK-NEXT: sltiu a3, a1, 129
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
-; CHECK-NEXT: sltu a0, a1, a3
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a3, a0, a3
+; CHECK-NEXT: addi a0, a1, -128
+; CHECK-NEXT: neg a3, a3
+; CHECK-NEXT: and a3, a3, a0
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t
@@ -406,10 +406,10 @@ define <256 x i8> @vssub_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vssub.vx v8, v8, a2
-; CHECK-NEXT: addi a1, a0, -128
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 129
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -128
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vssub.vx v16, v16, a2
; CHECK-NEXT: ret
@@ -1376,10 +1376,10 @@ define <32 x i64> @vssub_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vssub.vx v8, v8, a2, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vssub.vx v16, v16, a2, v0.t
@@ -1400,10 +1400,10 @@ define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vssub.vx v8, v8, a2
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vssub.vx v16, v16, a2
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
index f9000a1b88a6d..12fef2f06bfcf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
@@ -368,12 +368,12 @@ define <256 x i8> @vssubu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %
; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: li a2, 128
-; CHECK-NEXT: addi a3, a1, -128
+; CHECK-NEXT: sltiu a3, a1, 129
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
-; CHECK-NEXT: sltu a0, a1, a3
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a3, a0, a3
+; CHECK-NEXT: addi a0, a1, -128
+; CHECK-NEXT: neg a3, a3
+; CHECK-NEXT: and a3, a3, a0
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t
@@ -401,10 +401,10 @@ define <256 x i8> @vssubu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vssubu.vx v8, v8, a2
-; CHECK-NEXT: addi a1, a0, -128
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 129
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -128
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vssubu.vx v16, v16, a2
; CHECK-NEXT: ret
@@ -1371,10 +1371,10 @@ define <32 x i64> @vssubu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %e
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vssubu.vx v8, v8, a2, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vssubu.vx v16, v16, a2, v0.t
@@ -1395,10 +1395,10 @@ define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; CHECK-NEXT: li a2, -1
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vssubu.vx v8, v8, a2
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vssubu.vx v16, v16, a2
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
index e2d9e0ac2deea..0bdbf1bb54074 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll
@@ -147,10 +147,10 @@ define <32 x i64> @vzext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext
; CHECK-NEXT: .LBB12_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vzext.vf2 v16, v8, v0.t
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 16
; CHECK-NEXT: vmv1r.v v0, v24
@@ -174,10 +174,10 @@ define <32 x i64> @vzext_v32i64_v32i32_unmasked(<32 x i32> %va, i32 zeroext %evl
; CHECK-NEXT: .LBB13_2:
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vzext.vf2 v24, v8
-; CHECK-NEXT: addi a1, a0, -16
-; CHECK-NEXT: sltu a0, a0, a1
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: sltiu a1, a0, 17
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: neg a1, a1
+; CHECK-NEXT: and a0, a1, a0
; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 16
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
index e2deefa26ecb3..0ed12ddbb0f2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFH-NEXT: sub a2, a0, a1
; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT: sltu a3, a0, a2
+; RV32ZVFH-NEXT: sltu a3, a1, a0
; RV32ZVFH-NEXT: addi a3, a3, -1
; RV32ZVFH-NEXT: and a2, a3, a2
; RV32ZVFH-NEXT: vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFHMIN-NEXT: sub a2, a0, a1
; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT: sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT: sltu a3, a1, a0
; RV32ZVFHMIN-NEXT: addi a3, a3, -1
; RV32ZVFHMIN-NEXT: and a2, a3, a2
; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFH-NEXT: sub a3, a0, a1
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT: sltu a2, a0, a3
+; RV32ZVFH-NEXT: sltu a2, a1, a0
; RV32ZVFH-NEXT: addi a2, a2, -1
; RV32ZVFH-NEXT: and a2, a2, a3
; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFHMIN-NEXT: sub a3, a0, a1
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT: sltu a2, a1, a0
; RV32ZVFHMIN-NEXT: addi a2, a2, -1
; RV32ZVFHMIN-NEXT: and a2, a2, a3
; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index 0e0c92b150d33..33ae7ca7d7847 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -270,14 +270,14 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v5, v16, v16, v0.t
@@ -398,18 +398,18 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v6, v16, v16, v0.t
@@ -892,14 +892,14 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v5, v16, v16, v0.t
@@ -1031,18 +1031,18 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v6, v16, v16, v0.t
@@ -1418,7 +1418,7 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: sub a4, a2, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v8, (a3)
-; CHECK-NEXT: sltu a3, a2, a4
+; CHECK-NEXT: sltu a3, a1, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
; CHECK-NEXT: vmv1r.v v0, v6
@@ -1509,7 +1509,7 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: sub a4, a2, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v24, (a3)
-; CHECK-NEXT: sltu a3, a2, a4
+; CHECK-NEXT: sltu a3, a1, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index 86ed239e99373..173ea25335375 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -270,14 +270,14 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v5, v16, v16, v0.t
@@ -398,18 +398,18 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v6, v16, v16, v0.t
@@ -892,14 +892,14 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v5, v16, v16, v0.t
@@ -1031,18 +1031,18 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v6, v16, v16, v0.t
@@ -1418,7 +1418,7 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: sub a4, a2, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v8, (a3)
-; CHECK-NEXT: sltu a3, a2, a4
+; CHECK-NEXT: sltu a3, a1, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
; CHECK-NEXT: vmv1r.v v0, v6
@@ -1509,7 +1509,7 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: sub a4, a2, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v24, (a3)
-; CHECK-NEXT: sltu a3, a2, a4
+; CHECK-NEXT: sltu a3, a1, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
index 736dd1225da88..cbccc96f43cbe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
@@ -958,7 +958,7 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: sub a5, a4, a1
; CHECK-NEXT: add a6, a2, a3
; CHECK-NEXT: vl8re64.v v8, (a6)
-; CHECK-NEXT: sltu a6, a4, a5
+; CHECK-NEXT: sltu a6, a1, a4
; CHECK-NEXT: addi a6, a6, -1
; CHECK-NEXT: and a5, a6, a5
; CHECK-NEXT: srli a6, a1, 3
@@ -1059,7 +1059,7 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: add a3, a2, a5
; CHECK-NEXT: vl8re64.v v8, (a3)
-; CHECK-NEXT: sltu a3, a4, a6
+; CHECK-NEXT: sltu a3, a1, a4
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a6, a3, a6
; CHECK-NEXT: li a3, 63
diff --git a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
index c0a794afac3ae..c9478d65058f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
@@ -57,7 +57,7 @@ define <vscale x 16 x i64> @llrint_nxv16i64_nxv16f32(<vscale x 16 x float> %x, <
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
index c09df1a60d2ae..4136bab37bc9b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
@@ -119,7 +119,7 @@ define <vscale x 16 x iXLen> @lrint_nxv16f32(<vscale x 16 x float> %x, <vscale x
; RV64-i64-NEXT: srli a2, a1, 3
; RV64-i64-NEXT: sub a3, a0, a1
; RV64-i64-NEXT: vslidedown.vx v0, v0, a2
-; RV64-i64-NEXT: sltu a2, a0, a3
+; RV64-i64-NEXT: sltu a2, a1, a0
; RV64-i64-NEXT: addi a2, a2, -1
; RV64-i64-NEXT: and a2, a2, a3
; RV64-i64-NEXT: vsetvli zero, a2, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
index 67e7f7c7fbd42..236ba9096f4f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16(<vscale x 32 x bfloat> %va
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16_unmasked(<vscale x 32 x bf
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half>
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFH-NEXT: sub a2, a0, a1
; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT: sltu a3, a0, a2
+; RV32ZVFH-NEXT: sltu a3, a1, a0
; RV32ZVFH-NEXT: addi a3, a3, -1
; RV32ZVFH-NEXT: and a2, a3, a2
; RV32ZVFH-NEXT: vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFHMIN-NEXT: sub a2, a0, a1
; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT: sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT: sltu a3, a1, a0
; RV32ZVFHMIN-NEXT: addi a3, a3, -1
; RV32ZVFHMIN-NEXT: and a2, a3, a2
; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFH-NEXT: sub a3, a0, a1
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT: sltu a2, a0, a3
+; RV32ZVFH-NEXT: sltu a2, a1, a0
; RV32ZVFH-NEXT: addi a2, a2, -1
; RV32ZVFH-NEXT: and a2, a2, a3
; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFHMIN-NEXT: sub a3, a0, a1
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT: sltu a2, a1, a0
; RV32ZVFHMIN-NEXT: addi a2, a2, -1
; RV32ZVFHMIN-NEXT: and a2, a2, a3
; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll b/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll
index 1ee7e138654b9..3e9c669106a26 100644
--- a/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll
@@ -24274,7 +24274,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_P1(<vscale x 16 x
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24302,7 +24302,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_P1(<vscale x 16 x
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24335,7 +24335,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_PALL(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24363,7 +24363,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_PALL(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24396,7 +24396,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_S1(<vscale x 16 x
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24424,7 +24424,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_S1(<vscale x 16 x
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24457,7 +24457,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_ALL(<vscale x 16 x
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24485,7 +24485,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_ALL(<vscale x 16 x
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24517,7 +24517,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_DEFAULT(<vscale x
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24545,7 +24545,7 @@ define <vscale x 16 x i8> @test_nontemporal_vp_gather_nxv16i8_DEFAULT(<vscale x
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
@@ -24586,10 +24586,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_P1(<vscale x 16 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v9, (zero), v24
@@ -24614,10 +24614,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_P1(<vscale x 16 x i8> %val, <vs
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v9, (zero), v24
@@ -24647,10 +24647,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_PALL(<vscale x 16 x i8> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v9, (zero), v24
@@ -24675,10 +24675,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_PALL(<vscale x 16 x i8> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v9, (zero), v24
@@ -24708,10 +24708,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_S1(<vscale x 16 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v9, (zero), v24
@@ -24736,10 +24736,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_S1(<vscale x 16 x i8> %val, <vs
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v9, (zero), v24
@@ -24769,10 +24769,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_ALL(<vscale x 16 x i8> %val, <v
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v9, (zero), v24
@@ -24797,10 +24797,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_ALL(<vscale x 16 x i8> %val, <v
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v9, (zero), v24
@@ -24829,10 +24829,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_DEFAULT(<vscale x 16 x i8> %val
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v9, (zero), v24
@@ -24857,10 +24857,10 @@ define void @test_nontemporal_vp_scatter_nxv16i8_DEFAULT(<vscale x 16 x i8> %val
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v9, (zero), v24
@@ -25538,7 +25538,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_P1(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25566,7 +25566,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_P1(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25599,7 +25599,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_PALL(<vscale x 1
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25627,7 +25627,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_PALL(<vscale x 1
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25660,7 +25660,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_S1(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25688,7 +25688,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_S1(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25721,7 +25721,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_ALL(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25749,7 +25749,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_ALL(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25781,7 +25781,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_DEFAULT(<vscale
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25809,7 +25809,7 @@ define <vscale x 16 x i16> @test_nontemporal_vp_gather_nxv16i16_DEFAULT(<vscale
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
@@ -25850,10 +25850,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_P1(<vscale x 16 x i16> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v24
@@ -25878,10 +25878,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_P1(<vscale x 16 x i16> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v24
@@ -25911,10 +25911,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_PALL(<vscale x 16 x i16> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v24
@@ -25939,10 +25939,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_PALL(<vscale x 16 x i16> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v24
@@ -25972,10 +25972,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_S1(<vscale x 16 x i16> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v24
@@ -26000,10 +26000,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_S1(<vscale x 16 x i16> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v24
@@ -26033,10 +26033,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_ALL(<vscale x 16 x i16> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v24
@@ -26061,10 +26061,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_ALL(<vscale x 16 x i16> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v24
@@ -26093,10 +26093,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_DEFAULT(<vscale x 16 x i16> %v
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v24
@@ -26121,10 +26121,10 @@ define void @test_nontemporal_vp_scatter_nxv16i16_DEFAULT(<vscale x 16 x i16> %v
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v24
@@ -26802,7 +26802,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_P1(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -26829,7 +26829,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_P1(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -26861,7 +26861,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_PALL(<vscale x 1
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -26888,7 +26888,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_PALL(<vscale x 1
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -26920,7 +26920,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_S1(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -26947,7 +26947,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_S1(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -26979,7 +26979,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_ALL(<vscale x 16
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -27006,7 +27006,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_ALL(<vscale x 16
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -27037,7 +27037,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_DEFAULT(<vscale
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -27064,7 +27064,7 @@ define <vscale x 16 x i32> @test_nontemporal_vp_gather_nxv16i32_DEFAULT(<vscale
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -27104,10 +27104,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_P1(<vscale x 16 x i32> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -27132,10 +27132,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_P1(<vscale x 16 x i32> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -27165,10 +27165,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_PALL(<vscale x 16 x i32> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -27193,10 +27193,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_PALL(<vscale x 16 x i32> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -27226,10 +27226,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_S1(<vscale x 16 x i32> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -27254,10 +27254,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_S1(<vscale x 16 x i32> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -27287,10 +27287,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_ALL(<vscale x 16 x i32> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -27315,10 +27315,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_ALL(<vscale x 16 x i32> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -27347,10 +27347,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_DEFAULT(<vscale x 16 x i32> %v
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -27375,10 +27375,10 @@ define void @test_nontemporal_vp_scatter_nxv16i32_DEFAULT(<vscale x 16 x i32> %v
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -28056,7 +28056,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_P1(<vscale x 1
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28083,7 +28083,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_P1(<vscale x 1
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28115,7 +28115,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_PALL(<vscale x
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28142,7 +28142,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_PALL(<vscale x
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28174,7 +28174,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_S1(<vscale x 1
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28201,7 +28201,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_S1(<vscale x 1
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28233,7 +28233,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_ALL(<vscale x
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28260,7 +28260,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_ALL(<vscale x
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28291,7 +28291,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_DEFAULT(<vscal
; CHECK-RV64V: # %bb.0:
; CHECK-RV64V-NEXT: csrr a1, vlenb
; CHECK-RV64V-NEXT: sub a2, a0, a1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, a1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28318,7 +28318,7 @@ define <vscale x 16 x float> @test_nontemporal_vp_gather_nxv16f32_DEFAULT(<vscal
; CHECK-RV64VC: # %bb.0:
; CHECK-RV64VC-NEXT: csrr a1, vlenb
; CHECK-RV64VC-NEXT: sub a2, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, a1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -28358,10 +28358,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_P1(<vscale x 16 x float> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -28386,10 +28386,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_P1(<vscale x 16 x float> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -28419,10 +28419,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_PALL(<vscale x 16 x float> %va
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -28447,10 +28447,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_PALL(<vscale x 16 x float> %va
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -28480,10 +28480,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_S1(<vscale x 16 x float> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -28508,10 +28508,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_S1(<vscale x 16 x float> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -28541,10 +28541,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_ALL(<vscale x 16 x float> %val
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -28569,10 +28569,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_ALL(<vscale x 16 x float> %val
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -28601,10 +28601,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_DEFAULT(<vscale x 16 x float>
; CHECK-RV64V-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64V-NEXT: sub a0, a1, a0
-; CHECK-RV64V-NEXT: sltu a1, a1, a0
-; CHECK-RV64V-NEXT: addi a1, a1, -1
-; CHECK-RV64V-NEXT: and a0, a1, a0
+; CHECK-RV64V-NEXT: sub a2, a1, a0
+; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
@@ -28629,10 +28629,10 @@ define void @test_nontemporal_vp_scatter_nxv16f32_DEFAULT(<vscale x 16 x float>
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
-; CHECK-RV64VC-NEXT: sub a0, a1, a0
-; CHECK-RV64VC-NEXT: sltu a1, a1, a0
-; CHECK-RV64VC-NEXT: addi a1, a1, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a1, a0
+; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
@@ -29322,12 +29322,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB850_2
@@ -29345,7 +29345,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB850_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29374,7 +29374,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29406,12 +29406,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB850_2
@@ -29429,7 +29429,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB850_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29458,7 +29458,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_P1(<vscale x 32 x
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29495,12 +29495,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB851_2
@@ -29518,7 +29518,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB851_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29547,7 +29547,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29579,12 +29579,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB851_2
@@ -29602,7 +29602,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB851_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29631,7 +29631,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_PALL(<vscale x 32
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29668,12 +29668,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB852_2
@@ -29691,7 +29691,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB852_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29720,7 +29720,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29752,12 +29752,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB852_2
@@ -29775,7 +29775,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB852_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29804,7 +29804,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_S1(<vscale x 32 x
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29841,12 +29841,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB853_2
@@ -29864,7 +29864,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB853_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29893,7 +29893,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -29925,12 +29925,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB853_2
@@ -29948,7 +29948,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB853_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -29977,7 +29977,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_ALL(<vscale x 32 x
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -30013,12 +30013,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB854_2
@@ -30036,7 +30036,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB854_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30065,7 +30065,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -30097,12 +30097,12 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB854_2
@@ -30120,7 +30120,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB854_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30149,7 +30149,7 @@ define <vscale x 32 x i8> @test_nontemporal_vp_gather_nxv32i8_DEFAULT(<vscale x
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
@@ -30226,13 +30226,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v17, (zero), v0
@@ -30244,10 +30244,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30277,10 +30277,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v24
@@ -30342,13 +30342,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v17, (zero), v0
@@ -30360,10 +30360,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30393,10 +30393,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_P1(<vscale x 32 x i8> %val, <vs
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v24
@@ -30463,13 +30463,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v17, (zero), v0
@@ -30481,10 +30481,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30514,10 +30514,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v24
@@ -30579,13 +30579,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v17, (zero), v0
@@ -30597,10 +30597,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30630,10 +30630,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_PALL(<vscale x 32 x i8> %val, <
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v24
@@ -30700,13 +30700,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v17, (zero), v0
@@ -30718,10 +30718,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30751,10 +30751,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v24
@@ -30816,13 +30816,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v17, (zero), v0
@@ -30834,10 +30834,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30867,10 +30867,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_S1(<vscale x 32 x i8> %val, <vs
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v24
@@ -30937,13 +30937,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v17, (zero), v0
@@ -30955,10 +30955,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -30988,10 +30988,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v24
@@ -31053,13 +31053,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v17, (zero), v0
@@ -31071,10 +31071,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -31104,10 +31104,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_ALL(<vscale x 32 x i8> %val, <v
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v24
@@ -31173,13 +31173,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v17, (zero), v0
@@ -31191,10 +31191,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -31224,10 +31224,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v24
@@ -31289,13 +31289,13 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v16, (zero), v8
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v17, (zero), v0
@@ -31307,10 +31307,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v18, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -31340,10 +31340,10 @@ define void @test_nontemporal_vp_scatter_nxv32i8_DEFAULT(<vscale x 32 x i8> %val
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v24
@@ -32026,12 +32026,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB880_2
@@ -32049,7 +32049,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB880_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32078,7 +32078,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32110,12 +32110,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB880_2
@@ -32133,7 +32133,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB880_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32162,7 +32162,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_P1(<vscale x 32
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32199,12 +32199,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB881_2
@@ -32222,7 +32222,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB881_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32251,7 +32251,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32283,12 +32283,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB881_2
@@ -32306,7 +32306,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB881_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32335,7 +32335,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_PALL(<vscale x 3
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32372,12 +32372,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB882_2
@@ -32395,7 +32395,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB882_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32424,7 +32424,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32456,12 +32456,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB882_2
@@ -32479,7 +32479,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB882_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32508,7 +32508,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_S1(<vscale x 32
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32545,12 +32545,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB883_2
@@ -32568,7 +32568,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB883_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32597,7 +32597,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32629,12 +32629,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB883_2
@@ -32652,7 +32652,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB883_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32681,7 +32681,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_ALL(<vscale x 32
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32717,12 +32717,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
; CHECK-RV64V-NEXT: slli a3, a1, 1
; CHECK-RV64V-NEXT: add a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a2, a3
+; CHECK-RV64V-NEXT: sltu a5, a3, a2
; CHECK-RV64V-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64V-NEXT: sltu a0, a2, a4
-; CHECK-RV64V-NEXT: addi a0, a0, -1
+; CHECK-RV64V-NEXT: addi a0, a5, -1
; CHECK-RV64V-NEXT: and a0, a0, a4
; CHECK-RV64V-NEXT: sub a4, a0, a1
-; CHECK-RV64V-NEXT: sltu a5, a0, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a0
; CHECK-RV64V-NEXT: addi a5, a5, -1
; CHECK-RV64V-NEXT: and a4, a5, a4
; CHECK-RV64V-NEXT: bltu a0, a1, .LBB884_2
@@ -32740,7 +32740,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
; CHECK-RV64V-NEXT: mv a2, a3
; CHECK-RV64V-NEXT: .LBB884_4:
; CHECK-RV64V-NEXT: sub a0, a2, a1
-; CHECK-RV64V-NEXT: sltu a3, a2, a0
+; CHECK-RV64V-NEXT: sltu a3, a1, a2
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a0, a3, a0
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32769,7 +32769,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: slli a1, a1, 1
; CHECK-RV32V-NEXT: sub a2, a0, a1
-; CHECK-RV32V-NEXT: sltu a3, a0, a2
+; CHECK-RV32V-NEXT: sltu a3, a1, a0
; CHECK-RV32V-NEXT: addi a3, a3, -1
; CHECK-RV32V-NEXT: and a2, a3, a2
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32801,12 +32801,12 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
; CHECK-RV64VC-NEXT: slli a3, a1, 1
; CHECK-RV64VC-NEXT: add a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a2, a3
+; CHECK-RV64VC-NEXT: sltu a5, a3, a2
; CHECK-RV64VC-NEXT: vl8re64.v v0, (a0)
-; CHECK-RV64VC-NEXT: sltu a0, a2, a4
-; CHECK-RV64VC-NEXT: addi a0, a0, -1
+; CHECK-RV64VC-NEXT: addi a0, a5, -1
; CHECK-RV64VC-NEXT: and a0, a0, a4
; CHECK-RV64VC-NEXT: sub a4, a0, a1
-; CHECK-RV64VC-NEXT: sltu a5, a0, a4
+; CHECK-RV64VC-NEXT: sltu a5, a1, a0
; CHECK-RV64VC-NEXT: addi a5, a5, -1
; CHECK-RV64VC-NEXT: and a4, a4, a5
; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB884_2
@@ -32824,7 +32824,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
; CHECK-RV64VC-NEXT: mv a2, a3
; CHECK-RV64VC-NEXT: .LBB884_4:
; CHECK-RV64VC-NEXT: sub a0, a2, a1
-; CHECK-RV64VC-NEXT: sltu a3, a2, a0
+; CHECK-RV64VC-NEXT: sltu a3, a1, a2
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a0, a0, a3
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32853,7 +32853,7 @@ define <vscale x 32 x i16> @test_nontemporal_vp_gather_nxv32i16_DEFAULT(<vscale
; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: slli a1, a1, 1
; CHECK-RV32VC-NEXT: sub a2, a0, a1
-; CHECK-RV32VC-NEXT: sltu a3, a0, a2
+; CHECK-RV32VC-NEXT: sltu a3, a1, a0
; CHECK-RV32VC-NEXT: addi a3, a3, -1
; CHECK-RV32VC-NEXT: and a2, a2, a3
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -32907,13 +32907,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v0
@@ -32925,10 +32925,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -32955,10 +32955,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
@@ -32997,13 +32997,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v0
@@ -33015,10 +33015,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33045,10 +33045,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_P1(<vscale x 32 x i16> %val, <
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
@@ -33092,13 +33092,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v0
@@ -33110,10 +33110,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33140,10 +33140,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
@@ -33182,13 +33182,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v0
@@ -33200,10 +33200,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33230,10 +33230,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_PALL(<vscale x 32 x i16> %val,
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
@@ -33277,13 +33277,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v0
@@ -33295,10 +33295,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33325,10 +33325,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
@@ -33367,13 +33367,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v0
@@ -33385,10 +33385,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33415,10 +33415,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_S1(<vscale x 32 x i16> %val, <
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
@@ -33462,13 +33462,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v0
@@ -33480,10 +33480,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33510,10 +33510,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
@@ -33552,13 +33552,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v0
@@ -33570,10 +33570,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33600,10 +33600,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_ALL(<vscale x 32 x i16> %val,
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
@@ -33646,13 +33646,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, a1
-; CHECK-RV64V-NEXT: sub a2, a3, a2
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a2
+; CHECK-RV64V-NEXT: sltu a4, a1, a4
+; CHECK-RV64V-NEXT: sub a5, a3, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
+; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
-; CHECK-RV64V-NEXT: and a0, a3, a2
+; CHECK-RV64V-NEXT: and a0, a2, a5
; CHECK-RV64V-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v10, (zero), v0
@@ -33664,10 +33664,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
; CHECK-RV64V-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a2, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a0, a0, a1
+; CHECK-RV64V-NEXT: and a0, a0, a2
; CHECK-RV64V-NEXT: addi a1, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33694,10 +33694,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
; CHECK-RV32V-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32V-NEXT: sub a0, a1, a0
-; CHECK-RV32V-NEXT: sltu a1, a1, a0
-; CHECK-RV32V-NEXT: addi a1, a1, -1
-; CHECK-RV32V-NEXT: and a0, a1, a0
+; CHECK-RV32V-NEXT: sub a2, a1, a0
+; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: addi a0, a0, -1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
@@ -33736,13 +33736,13 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v8, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, a1
-; CHECK-RV64VC-NEXT: sub a2, a3, a2
-; CHECK-RV64VC-NEXT: sltu a4, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a2
+; CHECK-RV64VC-NEXT: sltu a4, a1, a4
+; CHECK-RV64VC-NEXT: sub a5, a3, a2
+; CHECK-RV64VC-NEXT: sltu a2, a2, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: addi a3, a3, -1
+; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a4, a4, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a2
+; CHECK-RV64VC-NEXT: and a0, a2, a5
; CHECK-RV64VC-NEXT: vsetvli zero, a4, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v10, (zero), v0
@@ -33754,10 +33754,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e16, m2, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: sub a1, a0, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sub a2, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
-; CHECK-RV64VC-NEXT: and a0, a0, a1
+; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: addi a1, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -33784,10 +33784,10 @@ define void @test_nontemporal_vp_scatter_nxv32i16_DEFAULT(<vscale x 32 x i16> %v
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
-; CHECK-RV32VC-NEXT: sub a0, a1, a0
-; CHECK-RV32VC-NEXT: sltu a1, a1, a0
-; CHECK-RV32VC-NEXT: addi a1, a1, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a1, a0
+; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: addi a0, a0, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
@@ -34527,30 +34527,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64V-NEXT: mv a0, s1
; CHECK-RV64V-NEXT: call __muldi3
; CHECK-RV64V-NEXT: slli a2, s1, 2
-; CHECK-RV64V-NEXT: sub a1, s0, a2
-; CHECK-RV64V-NEXT: sltu a3, s0, a1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
-; CHECK-RV64V-NEXT: and a3, a3, a1
; CHECK-RV64V-NEXT: slli a1, s1, 1
+; CHECK-RV64V-NEXT: sub a3, s0, a2
+; CHECK-RV64V-NEXT: sltu a4, a2, s0
+; CHECK-RV64V-NEXT: addi a4, a4, -1
+; CHECK-RV64V-NEXT: and a3, a4, a3
; CHECK-RV64V-NEXT: sub a4, a3, a1
-; CHECK-RV64V-NEXT: sltu a5, a3, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a3
; CHECK-RV64V-NEXT: addi a5, a5, -1
-; CHECK-RV64V-NEXT: and a6, a5, a4
-; CHECK-RV64V-NEXT: sub a4, a6, s1
-; CHECK-RV64V-NEXT: mv a5, a6
-; CHECK-RV64V-NEXT: bltu a6, s1, .LBB910_2
+; CHECK-RV64V-NEXT: and a5, a5, a4
+; CHECK-RV64V-NEXT: mv a4, a5
+; CHECK-RV64V-NEXT: bltu a5, s1, .LBB910_2
; CHECK-RV64V-NEXT: # %bb.1:
-; CHECK-RV64V-NEXT: mv a5, s1
+; CHECK-RV64V-NEXT: mv a4, s1
; CHECK-RV64V-NEXT: .LBB910_2:
-; CHECK-RV64V-NEXT: sltu a7, a6, a4
+; CHECK-RV64V-NEXT: sltu a7, s1, a5
; CHECK-RV64V-NEXT: bltu a3, a1, .LBB910_4
; CHECK-RV64V-NEXT: # %bb.3:
; CHECK-RV64V-NEXT: mv a3, a1
; CHECK-RV64V-NEXT: .LBB910_4:
; CHECK-RV64V-NEXT: add a6, s2, a0
-; CHECK-RV64V-NEXT: addi a0, a7, -1
+; CHECK-RV64V-NEXT: sub a0, a5, s1
+; CHECK-RV64V-NEXT: addi a5, a7, -1
; CHECK-RV64V-NEXT: sub a7, a3, s1
-; CHECK-RV64V-NEXT: sltu t0, a3, a7
+; CHECK-RV64V-NEXT: sltu t0, s1, a3
; CHECK-RV64V-NEXT: addi t0, t0, -1
; CHECK-RV64V-NEXT: and a7, t0, a7
; CHECK-RV64V-NEXT: bltu a3, s1, .LBB910_6
@@ -34560,26 +34560,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64V-NEXT: vl8re64.v v16, (a6)
; CHECK-RV64V-NEXT: addi a6, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 3
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 3
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a7, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 4
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 4
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: and a0, a0, a4
+; CHECK-RV64V-NEXT: and a0, a5, a0
; CHECK-RV64V-NEXT: bltu s0, a2, .LBB910_8
; CHECK-RV64V-NEXT: # %bb.7:
; CHECK-RV64V-NEXT: mv s0, a2
@@ -34589,11 +34589,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64V-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64V-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64V-NEXT: sub a0, s0, a1
-; CHECK-RV64V-NEXT: sltu a2, s0, a0
+; CHECK-RV64V-NEXT: sltu a2, a1, s0
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a0, a2, a0
; CHECK-RV64V-NEXT: sub a2, a0, s1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, s1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: bltu a0, s1, .LBB910_10
@@ -34619,7 +34619,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64V-NEXT: mv s0, a1
; CHECK-RV64V-NEXT: .LBB910_12:
; CHECK-RV64V-NEXT: sub a0, s0, s1
-; CHECK-RV64V-NEXT: sltu a1, s0, a0
+; CHECK-RV64V-NEXT: sltu a1, s1, s0
; CHECK-RV64V-NEXT: addi a1, a1, -1
; CHECK-RV64V-NEXT: and a0, a1, a0
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -34674,45 +34674,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV32V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT: csrr a4, vlenb
+; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT: slli a3, a4, 3
-; CHECK-RV32V-NEXT: slli a1, a4, 2
-; CHECK-RV32V-NEXT: add a0, a0, a3
-; CHECK-RV32V-NEXT: sub a3, a2, a1
+; CHECK-RV32V-NEXT: slli a4, a1, 3
+; CHECK-RV32V-NEXT: slli a3, a1, 2
+; CHECK-RV32V-NEXT: slli a1, a1, 1
+; CHECK-RV32V-NEXT: add a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a2, a3
+; CHECK-RV32V-NEXT: sltu a5, a3, a2
; CHECK-RV32V-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT: sltu a0, a2, a3
-; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a3, a0, a3
-; CHECK-RV32V-NEXT: slli a0, a4, 1
-; CHECK-RV32V-NEXT: sub a4, a3, a0
-; CHECK-RV32V-NEXT: sltu a5, a3, a4
+; CHECK-RV32V-NEXT: addi a0, a5, -1
+; CHECK-RV32V-NEXT: and a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a0, a1
+; CHECK-RV32V-NEXT: sltu a5, a1, a0
; CHECK-RV32V-NEXT: addi a5, a5, -1
; CHECK-RV32V-NEXT: and a4, a5, a4
-; CHECK-RV32V-NEXT: bltu a3, a0, .LBB910_2
+; CHECK-RV32V-NEXT: bltu a0, a1, .LBB910_2
; CHECK-RV32V-NEXT: # %bb.1:
-; CHECK-RV32V-NEXT: mv a3, a0
+; CHECK-RV32V-NEXT: mv a0, a1
; CHECK-RV32V-NEXT: .LBB910_2:
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT: bltu a2, a1, .LBB910_4
+; CHECK-RV32V-NEXT: bltu a2, a3, .LBB910_4
; CHECK-RV32V-NEXT: # %bb.3:
-; CHECK-RV32V-NEXT: mv a2, a1
+; CHECK-RV32V-NEXT: mv a2, a3
; CHECK-RV32V-NEXT: .LBB910_4:
-; CHECK-RV32V-NEXT: sub a1, a2, a0
-; CHECK-RV32V-NEXT: sltu a3, a2, a1
+; CHECK-RV32V-NEXT: sub a0, a2, a1
+; CHECK-RV32V-NEXT: sltu a3, a1, a2
; CHECK-RV32V-NEXT: addi a3, a3, -1
-; CHECK-RV32V-NEXT: and a1, a3, a1
-; CHECK-RV32V-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: and a0, a3, a0
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT: bltu a2, a0, .LBB910_6
+; CHECK-RV32V-NEXT: bltu a2, a1, .LBB910_6
; CHECK-RV32V-NEXT: # %bb.5:
-; CHECK-RV32V-NEXT: mv a2, a0
+; CHECK-RV32V-NEXT: mv a2, a1
; CHECK-RV32V-NEXT: .LBB910_6:
; CHECK-RV32V-NEXT: addi a0, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -34800,33 +34800,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64VC-NEXT: li a1, 40
; CHECK-RV64VC-NEXT: mv a0, s1
; CHECK-RV64VC-NEXT: call __muldi3
-; CHECK-RV64VC-NEXT: slli a7, s1, 2
-; CHECK-RV64VC-NEXT: sub a1, s0, a7
-; CHECK-RV64VC-NEXT: sltu a2, s0, a1
-; CHECK-RV64VC-NEXT: addi a2, a2, -1
-; CHECK-RV64VC-NEXT: and a3, a2, a1
+; CHECK-RV64VC-NEXT: slli a2, s1, 2
; CHECK-RV64VC-NEXT: slli a1, s1, 1
-; CHECK-RV64VC-NEXT: sub a2, a3, a1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a3, s0, a2
+; CHECK-RV64VC-NEXT: sltu a4, a2, s0
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
-; CHECK-RV64VC-NEXT: sub t0, a2, s1
-; CHECK-RV64VC-NEXT: mv a5, a2
-; CHECK-RV64VC-NEXT: bltu a2, s1, .LBB910_2
+; CHECK-RV64VC-NEXT: and a3, a3, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a5, a1, a3
+; CHECK-RV64VC-NEXT: addi a5, a5, -1
+; CHECK-RV64VC-NEXT: and a5, a5, a4
+; CHECK-RV64VC-NEXT: mv t0, a5
+; CHECK-RV64VC-NEXT: bltu a5, s1, .LBB910_2
; CHECK-RV64VC-NEXT: # %bb.1:
-; CHECK-RV64VC-NEXT: mv a5, s1
+; CHECK-RV64VC-NEXT: mv t0, s1
; CHECK-RV64VC-NEXT: .LBB910_2:
-; CHECK-RV64VC-NEXT: sltu a6, a2, t0
+; CHECK-RV64VC-NEXT: sltu a7, s1, a5
; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB910_4
; CHECK-RV64VC-NEXT: # %bb.3:
; CHECK-RV64VC-NEXT: mv a3, a1
; CHECK-RV64VC-NEXT: .LBB910_4:
; CHECK-RV64VC-NEXT: add a0, a0, s2
-; CHECK-RV64VC-NEXT: addi a6, a6, -1
-; CHECK-RV64VC-NEXT: sub a2, a3, s1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a6, a5, s1
+; CHECK-RV64VC-NEXT: addi a7, a7, -1
+; CHECK-RV64VC-NEXT: sub a5, a3, s1
+; CHECK-RV64VC-NEXT: sltu a4, s1, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
+; CHECK-RV64VC-NEXT: and a5, a5, a4
; CHECK-RV64VC-NEXT: bltu a3, s1, .LBB910_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a3, s1
@@ -34834,7 +34834,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a0)
; CHECK-RV64VC-NEXT: addi a0, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, t0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vluxei64.v v14, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -34842,7 +34842,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64VC-NEXT: add a0, a0, sp
; CHECK-RV64VC-NEXT: addi a0, a0, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vluxei64.v v13, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -34853,21 +34853,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: and a0, a6, t0
-; CHECK-RV64VC-NEXT: bltu s0, a7, .LBB910_8
+; CHECK-RV64VC-NEXT: and a0, a7, a6
+; CHECK-RV64VC-NEXT: bltu s0, a2, .LBB910_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv s0, a7
+; CHECK-RV64VC-NEXT: mv s0, a2
; CHECK-RV64VC-NEXT: .LBB910_8:
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64VC-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64VC-NEXT: sub a0, s0, a1
-; CHECK-RV64VC-NEXT: sltu a2, s0, a0
+; CHECK-RV64VC-NEXT: sltu a2, a1, s0
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: sub a2, a0, s1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, s1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: bltu a0, s1, .LBB910_10
@@ -34893,7 +34893,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV64VC-NEXT: mv s0, a1
; CHECK-RV64VC-NEXT: .LBB910_12:
; CHECK-RV64VC-NEXT: sub a0, s0, s1
-; CHECK-RV64VC-NEXT: sltu a1, s0, a0
+; CHECK-RV64VC-NEXT: sltu a1, s1, s0
; CHECK-RV64VC-NEXT: addi a1, a1, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -34948,45 +34948,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_P1(<vscale x 64 x
; CHECK-RV32VC-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT: csrr a4, vlenb
+; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT: slli a3, a4, 3
-; CHECK-RV32VC-NEXT: slli a1, a4, 2
-; CHECK-RV32VC-NEXT: add a0, a0, a3
-; CHECK-RV32VC-NEXT: sub a3, a2, a1
+; CHECK-RV32VC-NEXT: slli a4, a1, 3
+; CHECK-RV32VC-NEXT: slli a3, a1, 2
+; CHECK-RV32VC-NEXT: slli a1, a1, 1
+; CHECK-RV32VC-NEXT: add a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a2, a3
+; CHECK-RV32VC-NEXT: sltu a5, a3, a2
; CHECK-RV32VC-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT: sltu a0, a2, a3
-; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a3, a3, a0
-; CHECK-RV32VC-NEXT: slli a0, a4, 1
-; CHECK-RV32VC-NEXT: sub a4, a3, a0
-; CHECK-RV32VC-NEXT: sltu a5, a3, a4
+; CHECK-RV32VC-NEXT: addi a0, a5, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a0, a1
+; CHECK-RV32VC-NEXT: sltu a5, a1, a0
; CHECK-RV32VC-NEXT: addi a5, a5, -1
; CHECK-RV32VC-NEXT: and a4, a4, a5
-; CHECK-RV32VC-NEXT: bltu a3, a0, .LBB910_2
+; CHECK-RV32VC-NEXT: bltu a0, a1, .LBB910_2
; CHECK-RV32VC-NEXT: # %bb.1:
-; CHECK-RV32VC-NEXT: mv a3, a0
+; CHECK-RV32VC-NEXT: mv a0, a1
; CHECK-RV32VC-NEXT: .LBB910_2:
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB910_4
+; CHECK-RV32VC-NEXT: bltu a2, a3, .LBB910_4
; CHECK-RV32VC-NEXT: # %bb.3:
-; CHECK-RV32VC-NEXT: mv a2, a1
+; CHECK-RV32VC-NEXT: mv a2, a3
; CHECK-RV32VC-NEXT: .LBB910_4:
-; CHECK-RV32VC-NEXT: sub a1, a2, a0
-; CHECK-RV32VC-NEXT: sltu a3, a2, a1
+; CHECK-RV32VC-NEXT: sub a0, a2, a1
+; CHECK-RV32VC-NEXT: sltu a3, a1, a2
; CHECK-RV32VC-NEXT: addi a3, a3, -1
-; CHECK-RV32VC-NEXT: and a1, a1, a3
-; CHECK-RV32VC-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: and a0, a0, a3
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT: bltu a2, a0, .LBB910_6
+; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB910_6
; CHECK-RV32VC-NEXT: # %bb.5:
-; CHECK-RV32VC-NEXT: mv a2, a0
+; CHECK-RV32VC-NEXT: mv a2, a1
; CHECK-RV32VC-NEXT: .LBB910_6:
; CHECK-RV32VC-NEXT: addi a0, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35080,30 +35080,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64V-NEXT: mv a0, s1
; CHECK-RV64V-NEXT: call __muldi3
; CHECK-RV64V-NEXT: slli a2, s1, 2
-; CHECK-RV64V-NEXT: sub a1, s0, a2
-; CHECK-RV64V-NEXT: sltu a3, s0, a1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
-; CHECK-RV64V-NEXT: and a3, a3, a1
; CHECK-RV64V-NEXT: slli a1, s1, 1
+; CHECK-RV64V-NEXT: sub a3, s0, a2
+; CHECK-RV64V-NEXT: sltu a4, a2, s0
+; CHECK-RV64V-NEXT: addi a4, a4, -1
+; CHECK-RV64V-NEXT: and a3, a4, a3
; CHECK-RV64V-NEXT: sub a4, a3, a1
-; CHECK-RV64V-NEXT: sltu a5, a3, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a3
; CHECK-RV64V-NEXT: addi a5, a5, -1
-; CHECK-RV64V-NEXT: and a6, a5, a4
-; CHECK-RV64V-NEXT: sub a4, a6, s1
-; CHECK-RV64V-NEXT: mv a5, a6
-; CHECK-RV64V-NEXT: bltu a6, s1, .LBB911_2
+; CHECK-RV64V-NEXT: and a5, a5, a4
+; CHECK-RV64V-NEXT: mv a4, a5
+; CHECK-RV64V-NEXT: bltu a5, s1, .LBB911_2
; CHECK-RV64V-NEXT: # %bb.1:
-; CHECK-RV64V-NEXT: mv a5, s1
+; CHECK-RV64V-NEXT: mv a4, s1
; CHECK-RV64V-NEXT: .LBB911_2:
-; CHECK-RV64V-NEXT: sltu a7, a6, a4
+; CHECK-RV64V-NEXT: sltu a7, s1, a5
; CHECK-RV64V-NEXT: bltu a3, a1, .LBB911_4
; CHECK-RV64V-NEXT: # %bb.3:
; CHECK-RV64V-NEXT: mv a3, a1
; CHECK-RV64V-NEXT: .LBB911_4:
; CHECK-RV64V-NEXT: add a6, s2, a0
-; CHECK-RV64V-NEXT: addi a0, a7, -1
+; CHECK-RV64V-NEXT: sub a0, a5, s1
+; CHECK-RV64V-NEXT: addi a5, a7, -1
; CHECK-RV64V-NEXT: sub a7, a3, s1
-; CHECK-RV64V-NEXT: sltu t0, a3, a7
+; CHECK-RV64V-NEXT: sltu t0, s1, a3
; CHECK-RV64V-NEXT: addi t0, t0, -1
; CHECK-RV64V-NEXT: and a7, t0, a7
; CHECK-RV64V-NEXT: bltu a3, s1, .LBB911_6
@@ -35113,26 +35113,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64V-NEXT: vl8re64.v v16, (a6)
; CHECK-RV64V-NEXT: addi a6, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 3
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 3
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a7, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 4
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 4
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: and a0, a0, a4
+; CHECK-RV64V-NEXT: and a0, a5, a0
; CHECK-RV64V-NEXT: bltu s0, a2, .LBB911_8
; CHECK-RV64V-NEXT: # %bb.7:
; CHECK-RV64V-NEXT: mv s0, a2
@@ -35142,11 +35142,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64V-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64V-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64V-NEXT: sub a0, s0, a1
-; CHECK-RV64V-NEXT: sltu a2, s0, a0
+; CHECK-RV64V-NEXT: sltu a2, a1, s0
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a0, a2, a0
; CHECK-RV64V-NEXT: sub a2, a0, s1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, s1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: bltu a0, s1, .LBB911_10
@@ -35172,7 +35172,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64V-NEXT: mv s0, a1
; CHECK-RV64V-NEXT: .LBB911_12:
; CHECK-RV64V-NEXT: sub a0, s0, s1
-; CHECK-RV64V-NEXT: sltu a1, s0, a0
+; CHECK-RV64V-NEXT: sltu a1, s1, s0
; CHECK-RV64V-NEXT: addi a1, a1, -1
; CHECK-RV64V-NEXT: and a0, a1, a0
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -35227,45 +35227,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV32V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT: csrr a4, vlenb
+; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT: slli a3, a4, 3
-; CHECK-RV32V-NEXT: slli a1, a4, 2
-; CHECK-RV32V-NEXT: add a0, a0, a3
-; CHECK-RV32V-NEXT: sub a3, a2, a1
+; CHECK-RV32V-NEXT: slli a4, a1, 3
+; CHECK-RV32V-NEXT: slli a3, a1, 2
+; CHECK-RV32V-NEXT: slli a1, a1, 1
+; CHECK-RV32V-NEXT: add a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a2, a3
+; CHECK-RV32V-NEXT: sltu a5, a3, a2
; CHECK-RV32V-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT: sltu a0, a2, a3
-; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a3, a0, a3
-; CHECK-RV32V-NEXT: slli a0, a4, 1
-; CHECK-RV32V-NEXT: sub a4, a3, a0
-; CHECK-RV32V-NEXT: sltu a5, a3, a4
+; CHECK-RV32V-NEXT: addi a0, a5, -1
+; CHECK-RV32V-NEXT: and a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a0, a1
+; CHECK-RV32V-NEXT: sltu a5, a1, a0
; CHECK-RV32V-NEXT: addi a5, a5, -1
; CHECK-RV32V-NEXT: and a4, a5, a4
-; CHECK-RV32V-NEXT: bltu a3, a0, .LBB911_2
+; CHECK-RV32V-NEXT: bltu a0, a1, .LBB911_2
; CHECK-RV32V-NEXT: # %bb.1:
-; CHECK-RV32V-NEXT: mv a3, a0
+; CHECK-RV32V-NEXT: mv a0, a1
; CHECK-RV32V-NEXT: .LBB911_2:
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT: bltu a2, a1, .LBB911_4
+; CHECK-RV32V-NEXT: bltu a2, a3, .LBB911_4
; CHECK-RV32V-NEXT: # %bb.3:
-; CHECK-RV32V-NEXT: mv a2, a1
+; CHECK-RV32V-NEXT: mv a2, a3
; CHECK-RV32V-NEXT: .LBB911_4:
-; CHECK-RV32V-NEXT: sub a1, a2, a0
-; CHECK-RV32V-NEXT: sltu a3, a2, a1
+; CHECK-RV32V-NEXT: sub a0, a2, a1
+; CHECK-RV32V-NEXT: sltu a3, a1, a2
; CHECK-RV32V-NEXT: addi a3, a3, -1
-; CHECK-RV32V-NEXT: and a1, a3, a1
-; CHECK-RV32V-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: and a0, a3, a0
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT: bltu a2, a0, .LBB911_6
+; CHECK-RV32V-NEXT: bltu a2, a1, .LBB911_6
; CHECK-RV32V-NEXT: # %bb.5:
-; CHECK-RV32V-NEXT: mv a2, a0
+; CHECK-RV32V-NEXT: mv a2, a1
; CHECK-RV32V-NEXT: .LBB911_6:
; CHECK-RV32V-NEXT: addi a0, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35353,33 +35353,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64VC-NEXT: li a1, 40
; CHECK-RV64VC-NEXT: mv a0, s1
; CHECK-RV64VC-NEXT: call __muldi3
-; CHECK-RV64VC-NEXT: slli a7, s1, 2
-; CHECK-RV64VC-NEXT: sub a1, s0, a7
-; CHECK-RV64VC-NEXT: sltu a2, s0, a1
-; CHECK-RV64VC-NEXT: addi a2, a2, -1
-; CHECK-RV64VC-NEXT: and a3, a2, a1
+; CHECK-RV64VC-NEXT: slli a2, s1, 2
; CHECK-RV64VC-NEXT: slli a1, s1, 1
-; CHECK-RV64VC-NEXT: sub a2, a3, a1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a3, s0, a2
+; CHECK-RV64VC-NEXT: sltu a4, a2, s0
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
-; CHECK-RV64VC-NEXT: sub t0, a2, s1
-; CHECK-RV64VC-NEXT: mv a5, a2
-; CHECK-RV64VC-NEXT: bltu a2, s1, .LBB911_2
+; CHECK-RV64VC-NEXT: and a3, a3, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a5, a1, a3
+; CHECK-RV64VC-NEXT: addi a5, a5, -1
+; CHECK-RV64VC-NEXT: and a5, a5, a4
+; CHECK-RV64VC-NEXT: mv t0, a5
+; CHECK-RV64VC-NEXT: bltu a5, s1, .LBB911_2
; CHECK-RV64VC-NEXT: # %bb.1:
-; CHECK-RV64VC-NEXT: mv a5, s1
+; CHECK-RV64VC-NEXT: mv t0, s1
; CHECK-RV64VC-NEXT: .LBB911_2:
-; CHECK-RV64VC-NEXT: sltu a6, a2, t0
+; CHECK-RV64VC-NEXT: sltu a7, s1, a5
; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB911_4
; CHECK-RV64VC-NEXT: # %bb.3:
; CHECK-RV64VC-NEXT: mv a3, a1
; CHECK-RV64VC-NEXT: .LBB911_4:
; CHECK-RV64VC-NEXT: add a0, a0, s2
-; CHECK-RV64VC-NEXT: addi a6, a6, -1
-; CHECK-RV64VC-NEXT: sub a2, a3, s1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a6, a5, s1
+; CHECK-RV64VC-NEXT: addi a7, a7, -1
+; CHECK-RV64VC-NEXT: sub a5, a3, s1
+; CHECK-RV64VC-NEXT: sltu a4, s1, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
+; CHECK-RV64VC-NEXT: and a5, a5, a4
; CHECK-RV64VC-NEXT: bltu a3, s1, .LBB911_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a3, s1
@@ -35387,7 +35387,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a0)
; CHECK-RV64VC-NEXT: addi a0, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, t0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vluxei64.v v14, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -35395,7 +35395,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64VC-NEXT: add a0, a0, sp
; CHECK-RV64VC-NEXT: addi a0, a0, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vluxei64.v v13, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -35406,21 +35406,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: and a0, a6, t0
-; CHECK-RV64VC-NEXT: bltu s0, a7, .LBB911_8
+; CHECK-RV64VC-NEXT: and a0, a7, a6
+; CHECK-RV64VC-NEXT: bltu s0, a2, .LBB911_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv s0, a7
+; CHECK-RV64VC-NEXT: mv s0, a2
; CHECK-RV64VC-NEXT: .LBB911_8:
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64VC-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64VC-NEXT: sub a0, s0, a1
-; CHECK-RV64VC-NEXT: sltu a2, s0, a0
+; CHECK-RV64VC-NEXT: sltu a2, a1, s0
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: sub a2, a0, s1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, s1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: bltu a0, s1, .LBB911_10
@@ -35446,7 +35446,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV64VC-NEXT: mv s0, a1
; CHECK-RV64VC-NEXT: .LBB911_12:
; CHECK-RV64VC-NEXT: sub a0, s0, s1
-; CHECK-RV64VC-NEXT: sltu a1, s0, a0
+; CHECK-RV64VC-NEXT: sltu a1, s1, s0
; CHECK-RV64VC-NEXT: addi a1, a1, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -35501,45 +35501,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_PALL(<vscale x 64
; CHECK-RV32VC-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT: csrr a4, vlenb
+; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT: slli a3, a4, 3
-; CHECK-RV32VC-NEXT: slli a1, a4, 2
-; CHECK-RV32VC-NEXT: add a0, a0, a3
-; CHECK-RV32VC-NEXT: sub a3, a2, a1
+; CHECK-RV32VC-NEXT: slli a4, a1, 3
+; CHECK-RV32VC-NEXT: slli a3, a1, 2
+; CHECK-RV32VC-NEXT: slli a1, a1, 1
+; CHECK-RV32VC-NEXT: add a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a2, a3
+; CHECK-RV32VC-NEXT: sltu a5, a3, a2
; CHECK-RV32VC-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT: sltu a0, a2, a3
-; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a3, a3, a0
-; CHECK-RV32VC-NEXT: slli a0, a4, 1
-; CHECK-RV32VC-NEXT: sub a4, a3, a0
-; CHECK-RV32VC-NEXT: sltu a5, a3, a4
+; CHECK-RV32VC-NEXT: addi a0, a5, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a0, a1
+; CHECK-RV32VC-NEXT: sltu a5, a1, a0
; CHECK-RV32VC-NEXT: addi a5, a5, -1
; CHECK-RV32VC-NEXT: and a4, a4, a5
-; CHECK-RV32VC-NEXT: bltu a3, a0, .LBB911_2
+; CHECK-RV32VC-NEXT: bltu a0, a1, .LBB911_2
; CHECK-RV32VC-NEXT: # %bb.1:
-; CHECK-RV32VC-NEXT: mv a3, a0
+; CHECK-RV32VC-NEXT: mv a0, a1
; CHECK-RV32VC-NEXT: .LBB911_2:
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB911_4
+; CHECK-RV32VC-NEXT: bltu a2, a3, .LBB911_4
; CHECK-RV32VC-NEXT: # %bb.3:
-; CHECK-RV32VC-NEXT: mv a2, a1
+; CHECK-RV32VC-NEXT: mv a2, a3
; CHECK-RV32VC-NEXT: .LBB911_4:
-; CHECK-RV32VC-NEXT: sub a1, a2, a0
-; CHECK-RV32VC-NEXT: sltu a3, a2, a1
+; CHECK-RV32VC-NEXT: sub a0, a2, a1
+; CHECK-RV32VC-NEXT: sltu a3, a1, a2
; CHECK-RV32VC-NEXT: addi a3, a3, -1
-; CHECK-RV32VC-NEXT: and a1, a1, a3
-; CHECK-RV32VC-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: and a0, a0, a3
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT: bltu a2, a0, .LBB911_6
+; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB911_6
; CHECK-RV32VC-NEXT: # %bb.5:
-; CHECK-RV32VC-NEXT: mv a2, a0
+; CHECK-RV32VC-NEXT: mv a2, a1
; CHECK-RV32VC-NEXT: .LBB911_6:
; CHECK-RV32VC-NEXT: addi a0, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35633,30 +35633,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64V-NEXT: mv a0, s1
; CHECK-RV64V-NEXT: call __muldi3
; CHECK-RV64V-NEXT: slli a2, s1, 2
-; CHECK-RV64V-NEXT: sub a1, s0, a2
-; CHECK-RV64V-NEXT: sltu a3, s0, a1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
-; CHECK-RV64V-NEXT: and a3, a3, a1
; CHECK-RV64V-NEXT: slli a1, s1, 1
+; CHECK-RV64V-NEXT: sub a3, s0, a2
+; CHECK-RV64V-NEXT: sltu a4, a2, s0
+; CHECK-RV64V-NEXT: addi a4, a4, -1
+; CHECK-RV64V-NEXT: and a3, a4, a3
; CHECK-RV64V-NEXT: sub a4, a3, a1
-; CHECK-RV64V-NEXT: sltu a5, a3, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a3
; CHECK-RV64V-NEXT: addi a5, a5, -1
-; CHECK-RV64V-NEXT: and a6, a5, a4
-; CHECK-RV64V-NEXT: sub a4, a6, s1
-; CHECK-RV64V-NEXT: mv a5, a6
-; CHECK-RV64V-NEXT: bltu a6, s1, .LBB912_2
+; CHECK-RV64V-NEXT: and a5, a5, a4
+; CHECK-RV64V-NEXT: mv a4, a5
+; CHECK-RV64V-NEXT: bltu a5, s1, .LBB912_2
; CHECK-RV64V-NEXT: # %bb.1:
-; CHECK-RV64V-NEXT: mv a5, s1
+; CHECK-RV64V-NEXT: mv a4, s1
; CHECK-RV64V-NEXT: .LBB912_2:
-; CHECK-RV64V-NEXT: sltu a7, a6, a4
+; CHECK-RV64V-NEXT: sltu a7, s1, a5
; CHECK-RV64V-NEXT: bltu a3, a1, .LBB912_4
; CHECK-RV64V-NEXT: # %bb.3:
; CHECK-RV64V-NEXT: mv a3, a1
; CHECK-RV64V-NEXT: .LBB912_4:
; CHECK-RV64V-NEXT: add a6, s2, a0
-; CHECK-RV64V-NEXT: addi a0, a7, -1
+; CHECK-RV64V-NEXT: sub a0, a5, s1
+; CHECK-RV64V-NEXT: addi a5, a7, -1
; CHECK-RV64V-NEXT: sub a7, a3, s1
-; CHECK-RV64V-NEXT: sltu t0, a3, a7
+; CHECK-RV64V-NEXT: sltu t0, s1, a3
; CHECK-RV64V-NEXT: addi t0, t0, -1
; CHECK-RV64V-NEXT: and a7, t0, a7
; CHECK-RV64V-NEXT: bltu a3, s1, .LBB912_6
@@ -35666,26 +35666,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64V-NEXT: vl8re64.v v16, (a6)
; CHECK-RV64V-NEXT: addi a6, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 3
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 3
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a7, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 4
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 4
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: and a0, a0, a4
+; CHECK-RV64V-NEXT: and a0, a5, a0
; CHECK-RV64V-NEXT: bltu s0, a2, .LBB912_8
; CHECK-RV64V-NEXT: # %bb.7:
; CHECK-RV64V-NEXT: mv s0, a2
@@ -35695,11 +35695,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64V-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64V-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64V-NEXT: sub a0, s0, a1
-; CHECK-RV64V-NEXT: sltu a2, s0, a0
+; CHECK-RV64V-NEXT: sltu a2, a1, s0
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a0, a2, a0
; CHECK-RV64V-NEXT: sub a2, a0, s1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, s1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: bltu a0, s1, .LBB912_10
@@ -35725,7 +35725,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64V-NEXT: mv s0, a1
; CHECK-RV64V-NEXT: .LBB912_12:
; CHECK-RV64V-NEXT: sub a0, s0, s1
-; CHECK-RV64V-NEXT: sltu a1, s0, a0
+; CHECK-RV64V-NEXT: sltu a1, s1, s0
; CHECK-RV64V-NEXT: addi a1, a1, -1
; CHECK-RV64V-NEXT: and a0, a1, a0
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -35780,45 +35780,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV32V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT: csrr a4, vlenb
+; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT: slli a3, a4, 3
-; CHECK-RV32V-NEXT: slli a1, a4, 2
-; CHECK-RV32V-NEXT: add a0, a0, a3
-; CHECK-RV32V-NEXT: sub a3, a2, a1
+; CHECK-RV32V-NEXT: slli a4, a1, 3
+; CHECK-RV32V-NEXT: slli a3, a1, 2
+; CHECK-RV32V-NEXT: slli a1, a1, 1
+; CHECK-RV32V-NEXT: add a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a2, a3
+; CHECK-RV32V-NEXT: sltu a5, a3, a2
; CHECK-RV32V-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT: sltu a0, a2, a3
-; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a3, a0, a3
-; CHECK-RV32V-NEXT: slli a0, a4, 1
-; CHECK-RV32V-NEXT: sub a4, a3, a0
-; CHECK-RV32V-NEXT: sltu a5, a3, a4
+; CHECK-RV32V-NEXT: addi a0, a5, -1
+; CHECK-RV32V-NEXT: and a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a0, a1
+; CHECK-RV32V-NEXT: sltu a5, a1, a0
; CHECK-RV32V-NEXT: addi a5, a5, -1
; CHECK-RV32V-NEXT: and a4, a5, a4
-; CHECK-RV32V-NEXT: bltu a3, a0, .LBB912_2
+; CHECK-RV32V-NEXT: bltu a0, a1, .LBB912_2
; CHECK-RV32V-NEXT: # %bb.1:
-; CHECK-RV32V-NEXT: mv a3, a0
+; CHECK-RV32V-NEXT: mv a0, a1
; CHECK-RV32V-NEXT: .LBB912_2:
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT: bltu a2, a1, .LBB912_4
+; CHECK-RV32V-NEXT: bltu a2, a3, .LBB912_4
; CHECK-RV32V-NEXT: # %bb.3:
-; CHECK-RV32V-NEXT: mv a2, a1
+; CHECK-RV32V-NEXT: mv a2, a3
; CHECK-RV32V-NEXT: .LBB912_4:
-; CHECK-RV32V-NEXT: sub a1, a2, a0
-; CHECK-RV32V-NEXT: sltu a3, a2, a1
+; CHECK-RV32V-NEXT: sub a0, a2, a1
+; CHECK-RV32V-NEXT: sltu a3, a1, a2
; CHECK-RV32V-NEXT: addi a3, a3, -1
-; CHECK-RV32V-NEXT: and a1, a3, a1
-; CHECK-RV32V-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: and a0, a3, a0
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT: bltu a2, a0, .LBB912_6
+; CHECK-RV32V-NEXT: bltu a2, a1, .LBB912_6
; CHECK-RV32V-NEXT: # %bb.5:
-; CHECK-RV32V-NEXT: mv a2, a0
+; CHECK-RV32V-NEXT: mv a2, a1
; CHECK-RV32V-NEXT: .LBB912_6:
; CHECK-RV32V-NEXT: addi a0, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -35906,33 +35906,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64VC-NEXT: li a1, 40
; CHECK-RV64VC-NEXT: mv a0, s1
; CHECK-RV64VC-NEXT: call __muldi3
-; CHECK-RV64VC-NEXT: slli a7, s1, 2
-; CHECK-RV64VC-NEXT: sub a1, s0, a7
-; CHECK-RV64VC-NEXT: sltu a2, s0, a1
-; CHECK-RV64VC-NEXT: addi a2, a2, -1
-; CHECK-RV64VC-NEXT: and a3, a2, a1
+; CHECK-RV64VC-NEXT: slli a2, s1, 2
; CHECK-RV64VC-NEXT: slli a1, s1, 1
-; CHECK-RV64VC-NEXT: sub a2, a3, a1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a3, s0, a2
+; CHECK-RV64VC-NEXT: sltu a4, a2, s0
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
-; CHECK-RV64VC-NEXT: sub t0, a2, s1
-; CHECK-RV64VC-NEXT: mv a5, a2
-; CHECK-RV64VC-NEXT: bltu a2, s1, .LBB912_2
+; CHECK-RV64VC-NEXT: and a3, a3, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a5, a1, a3
+; CHECK-RV64VC-NEXT: addi a5, a5, -1
+; CHECK-RV64VC-NEXT: and a5, a5, a4
+; CHECK-RV64VC-NEXT: mv t0, a5
+; CHECK-RV64VC-NEXT: bltu a5, s1, .LBB912_2
; CHECK-RV64VC-NEXT: # %bb.1:
-; CHECK-RV64VC-NEXT: mv a5, s1
+; CHECK-RV64VC-NEXT: mv t0, s1
; CHECK-RV64VC-NEXT: .LBB912_2:
-; CHECK-RV64VC-NEXT: sltu a6, a2, t0
+; CHECK-RV64VC-NEXT: sltu a7, s1, a5
; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB912_4
; CHECK-RV64VC-NEXT: # %bb.3:
; CHECK-RV64VC-NEXT: mv a3, a1
; CHECK-RV64VC-NEXT: .LBB912_4:
; CHECK-RV64VC-NEXT: add a0, a0, s2
-; CHECK-RV64VC-NEXT: addi a6, a6, -1
-; CHECK-RV64VC-NEXT: sub a2, a3, s1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a6, a5, s1
+; CHECK-RV64VC-NEXT: addi a7, a7, -1
+; CHECK-RV64VC-NEXT: sub a5, a3, s1
+; CHECK-RV64VC-NEXT: sltu a4, s1, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
+; CHECK-RV64VC-NEXT: and a5, a5, a4
; CHECK-RV64VC-NEXT: bltu a3, s1, .LBB912_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a3, s1
@@ -35940,7 +35940,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a0)
; CHECK-RV64VC-NEXT: addi a0, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, t0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vluxei64.v v14, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -35948,7 +35948,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64VC-NEXT: add a0, a0, sp
; CHECK-RV64VC-NEXT: addi a0, a0, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vluxei64.v v13, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -35959,21 +35959,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: and a0, a6, t0
-; CHECK-RV64VC-NEXT: bltu s0, a7, .LBB912_8
+; CHECK-RV64VC-NEXT: and a0, a7, a6
+; CHECK-RV64VC-NEXT: bltu s0, a2, .LBB912_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv s0, a7
+; CHECK-RV64VC-NEXT: mv s0, a2
; CHECK-RV64VC-NEXT: .LBB912_8:
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64VC-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64VC-NEXT: sub a0, s0, a1
-; CHECK-RV64VC-NEXT: sltu a2, s0, a0
+; CHECK-RV64VC-NEXT: sltu a2, a1, s0
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: sub a2, a0, s1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, s1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: bltu a0, s1, .LBB912_10
@@ -35999,7 +35999,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV64VC-NEXT: mv s0, a1
; CHECK-RV64VC-NEXT: .LBB912_12:
; CHECK-RV64VC-NEXT: sub a0, s0, s1
-; CHECK-RV64VC-NEXT: sltu a1, s0, a0
+; CHECK-RV64VC-NEXT: sltu a1, s1, s0
; CHECK-RV64VC-NEXT: addi a1, a1, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -36054,45 +36054,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_S1(<vscale x 64 x
; CHECK-RV32VC-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT: csrr a4, vlenb
+; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT: slli a3, a4, 3
-; CHECK-RV32VC-NEXT: slli a1, a4, 2
-; CHECK-RV32VC-NEXT: add a0, a0, a3
-; CHECK-RV32VC-NEXT: sub a3, a2, a1
+; CHECK-RV32VC-NEXT: slli a4, a1, 3
+; CHECK-RV32VC-NEXT: slli a3, a1, 2
+; CHECK-RV32VC-NEXT: slli a1, a1, 1
+; CHECK-RV32VC-NEXT: add a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a2, a3
+; CHECK-RV32VC-NEXT: sltu a5, a3, a2
; CHECK-RV32VC-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT: sltu a0, a2, a3
-; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a3, a3, a0
-; CHECK-RV32VC-NEXT: slli a0, a4, 1
-; CHECK-RV32VC-NEXT: sub a4, a3, a0
-; CHECK-RV32VC-NEXT: sltu a5, a3, a4
+; CHECK-RV32VC-NEXT: addi a0, a5, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a0, a1
+; CHECK-RV32VC-NEXT: sltu a5, a1, a0
; CHECK-RV32VC-NEXT: addi a5, a5, -1
; CHECK-RV32VC-NEXT: and a4, a4, a5
-; CHECK-RV32VC-NEXT: bltu a3, a0, .LBB912_2
+; CHECK-RV32VC-NEXT: bltu a0, a1, .LBB912_2
; CHECK-RV32VC-NEXT: # %bb.1:
-; CHECK-RV32VC-NEXT: mv a3, a0
+; CHECK-RV32VC-NEXT: mv a0, a1
; CHECK-RV32VC-NEXT: .LBB912_2:
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB912_4
+; CHECK-RV32VC-NEXT: bltu a2, a3, .LBB912_4
; CHECK-RV32VC-NEXT: # %bb.3:
-; CHECK-RV32VC-NEXT: mv a2, a1
+; CHECK-RV32VC-NEXT: mv a2, a3
; CHECK-RV32VC-NEXT: .LBB912_4:
-; CHECK-RV32VC-NEXT: sub a1, a2, a0
-; CHECK-RV32VC-NEXT: sltu a3, a2, a1
+; CHECK-RV32VC-NEXT: sub a0, a2, a1
+; CHECK-RV32VC-NEXT: sltu a3, a1, a2
; CHECK-RV32VC-NEXT: addi a3, a3, -1
-; CHECK-RV32VC-NEXT: and a1, a1, a3
-; CHECK-RV32VC-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: and a0, a0, a3
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT: bltu a2, a0, .LBB912_6
+; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB912_6
; CHECK-RV32VC-NEXT: # %bb.5:
-; CHECK-RV32VC-NEXT: mv a2, a0
+; CHECK-RV32VC-NEXT: mv a2, a1
; CHECK-RV32VC-NEXT: .LBB912_6:
; CHECK-RV32VC-NEXT: addi a0, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -36186,30 +36186,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64V-NEXT: mv a0, s1
; CHECK-RV64V-NEXT: call __muldi3
; CHECK-RV64V-NEXT: slli a2, s1, 2
-; CHECK-RV64V-NEXT: sub a1, s0, a2
-; CHECK-RV64V-NEXT: sltu a3, s0, a1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
-; CHECK-RV64V-NEXT: and a3, a3, a1
; CHECK-RV64V-NEXT: slli a1, s1, 1
+; CHECK-RV64V-NEXT: sub a3, s0, a2
+; CHECK-RV64V-NEXT: sltu a4, a2, s0
+; CHECK-RV64V-NEXT: addi a4, a4, -1
+; CHECK-RV64V-NEXT: and a3, a4, a3
; CHECK-RV64V-NEXT: sub a4, a3, a1
-; CHECK-RV64V-NEXT: sltu a5, a3, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a3
; CHECK-RV64V-NEXT: addi a5, a5, -1
-; CHECK-RV64V-NEXT: and a6, a5, a4
-; CHECK-RV64V-NEXT: sub a4, a6, s1
-; CHECK-RV64V-NEXT: mv a5, a6
-; CHECK-RV64V-NEXT: bltu a6, s1, .LBB913_2
+; CHECK-RV64V-NEXT: and a5, a5, a4
+; CHECK-RV64V-NEXT: mv a4, a5
+; CHECK-RV64V-NEXT: bltu a5, s1, .LBB913_2
; CHECK-RV64V-NEXT: # %bb.1:
-; CHECK-RV64V-NEXT: mv a5, s1
+; CHECK-RV64V-NEXT: mv a4, s1
; CHECK-RV64V-NEXT: .LBB913_2:
-; CHECK-RV64V-NEXT: sltu a7, a6, a4
+; CHECK-RV64V-NEXT: sltu a7, s1, a5
; CHECK-RV64V-NEXT: bltu a3, a1, .LBB913_4
; CHECK-RV64V-NEXT: # %bb.3:
; CHECK-RV64V-NEXT: mv a3, a1
; CHECK-RV64V-NEXT: .LBB913_4:
; CHECK-RV64V-NEXT: add a6, s2, a0
-; CHECK-RV64V-NEXT: addi a0, a7, -1
+; CHECK-RV64V-NEXT: sub a0, a5, s1
+; CHECK-RV64V-NEXT: addi a5, a7, -1
; CHECK-RV64V-NEXT: sub a7, a3, s1
-; CHECK-RV64V-NEXT: sltu t0, a3, a7
+; CHECK-RV64V-NEXT: sltu t0, s1, a3
; CHECK-RV64V-NEXT: addi t0, t0, -1
; CHECK-RV64V-NEXT: and a7, t0, a7
; CHECK-RV64V-NEXT: bltu a3, s1, .LBB913_6
@@ -36219,26 +36219,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64V-NEXT: vl8re64.v v16, (a6)
; CHECK-RV64V-NEXT: addi a6, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 3
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 3
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a7, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 4
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 4
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: and a0, a0, a4
+; CHECK-RV64V-NEXT: and a0, a5, a0
; CHECK-RV64V-NEXT: bltu s0, a2, .LBB913_8
; CHECK-RV64V-NEXT: # %bb.7:
; CHECK-RV64V-NEXT: mv s0, a2
@@ -36248,11 +36248,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64V-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64V-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64V-NEXT: sub a0, s0, a1
-; CHECK-RV64V-NEXT: sltu a2, s0, a0
+; CHECK-RV64V-NEXT: sltu a2, a1, s0
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a0, a2, a0
; CHECK-RV64V-NEXT: sub a2, a0, s1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, s1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: bltu a0, s1, .LBB913_10
@@ -36278,7 +36278,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64V-NEXT: mv s0, a1
; CHECK-RV64V-NEXT: .LBB913_12:
; CHECK-RV64V-NEXT: sub a0, s0, s1
-; CHECK-RV64V-NEXT: sltu a1, s0, a0
+; CHECK-RV64V-NEXT: sltu a1, s1, s0
; CHECK-RV64V-NEXT: addi a1, a1, -1
; CHECK-RV64V-NEXT: and a0, a1, a0
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -36333,45 +36333,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV32V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT: csrr a4, vlenb
+; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT: slli a3, a4, 3
-; CHECK-RV32V-NEXT: slli a1, a4, 2
-; CHECK-RV32V-NEXT: add a0, a0, a3
-; CHECK-RV32V-NEXT: sub a3, a2, a1
+; CHECK-RV32V-NEXT: slli a4, a1, 3
+; CHECK-RV32V-NEXT: slli a3, a1, 2
+; CHECK-RV32V-NEXT: slli a1, a1, 1
+; CHECK-RV32V-NEXT: add a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a2, a3
+; CHECK-RV32V-NEXT: sltu a5, a3, a2
; CHECK-RV32V-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT: sltu a0, a2, a3
-; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a3, a0, a3
-; CHECK-RV32V-NEXT: slli a0, a4, 1
-; CHECK-RV32V-NEXT: sub a4, a3, a0
-; CHECK-RV32V-NEXT: sltu a5, a3, a4
+; CHECK-RV32V-NEXT: addi a0, a5, -1
+; CHECK-RV32V-NEXT: and a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a0, a1
+; CHECK-RV32V-NEXT: sltu a5, a1, a0
; CHECK-RV32V-NEXT: addi a5, a5, -1
; CHECK-RV32V-NEXT: and a4, a5, a4
-; CHECK-RV32V-NEXT: bltu a3, a0, .LBB913_2
+; CHECK-RV32V-NEXT: bltu a0, a1, .LBB913_2
; CHECK-RV32V-NEXT: # %bb.1:
-; CHECK-RV32V-NEXT: mv a3, a0
+; CHECK-RV32V-NEXT: mv a0, a1
; CHECK-RV32V-NEXT: .LBB913_2:
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT: bltu a2, a1, .LBB913_4
+; CHECK-RV32V-NEXT: bltu a2, a3, .LBB913_4
; CHECK-RV32V-NEXT: # %bb.3:
-; CHECK-RV32V-NEXT: mv a2, a1
+; CHECK-RV32V-NEXT: mv a2, a3
; CHECK-RV32V-NEXT: .LBB913_4:
-; CHECK-RV32V-NEXT: sub a1, a2, a0
-; CHECK-RV32V-NEXT: sltu a3, a2, a1
+; CHECK-RV32V-NEXT: sub a0, a2, a1
+; CHECK-RV32V-NEXT: sltu a3, a1, a2
; CHECK-RV32V-NEXT: addi a3, a3, -1
-; CHECK-RV32V-NEXT: and a1, a3, a1
-; CHECK-RV32V-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: and a0, a3, a0
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT: bltu a2, a0, .LBB913_6
+; CHECK-RV32V-NEXT: bltu a2, a1, .LBB913_6
; CHECK-RV32V-NEXT: # %bb.5:
-; CHECK-RV32V-NEXT: mv a2, a0
+; CHECK-RV32V-NEXT: mv a2, a1
; CHECK-RV32V-NEXT: .LBB913_6:
; CHECK-RV32V-NEXT: addi a0, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -36459,33 +36459,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64VC-NEXT: li a1, 40
; CHECK-RV64VC-NEXT: mv a0, s1
; CHECK-RV64VC-NEXT: call __muldi3
-; CHECK-RV64VC-NEXT: slli a7, s1, 2
-; CHECK-RV64VC-NEXT: sub a1, s0, a7
-; CHECK-RV64VC-NEXT: sltu a2, s0, a1
-; CHECK-RV64VC-NEXT: addi a2, a2, -1
-; CHECK-RV64VC-NEXT: and a3, a2, a1
+; CHECK-RV64VC-NEXT: slli a2, s1, 2
; CHECK-RV64VC-NEXT: slli a1, s1, 1
-; CHECK-RV64VC-NEXT: sub a2, a3, a1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a3, s0, a2
+; CHECK-RV64VC-NEXT: sltu a4, a2, s0
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
-; CHECK-RV64VC-NEXT: sub t0, a2, s1
-; CHECK-RV64VC-NEXT: mv a5, a2
-; CHECK-RV64VC-NEXT: bltu a2, s1, .LBB913_2
+; CHECK-RV64VC-NEXT: and a3, a3, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a5, a1, a3
+; CHECK-RV64VC-NEXT: addi a5, a5, -1
+; CHECK-RV64VC-NEXT: and a5, a5, a4
+; CHECK-RV64VC-NEXT: mv t0, a5
+; CHECK-RV64VC-NEXT: bltu a5, s1, .LBB913_2
; CHECK-RV64VC-NEXT: # %bb.1:
-; CHECK-RV64VC-NEXT: mv a5, s1
+; CHECK-RV64VC-NEXT: mv t0, s1
; CHECK-RV64VC-NEXT: .LBB913_2:
-; CHECK-RV64VC-NEXT: sltu a6, a2, t0
+; CHECK-RV64VC-NEXT: sltu a7, s1, a5
; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB913_4
; CHECK-RV64VC-NEXT: # %bb.3:
; CHECK-RV64VC-NEXT: mv a3, a1
; CHECK-RV64VC-NEXT: .LBB913_4:
; CHECK-RV64VC-NEXT: add a0, a0, s2
-; CHECK-RV64VC-NEXT: addi a6, a6, -1
-; CHECK-RV64VC-NEXT: sub a2, a3, s1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a6, a5, s1
+; CHECK-RV64VC-NEXT: addi a7, a7, -1
+; CHECK-RV64VC-NEXT: sub a5, a3, s1
+; CHECK-RV64VC-NEXT: sltu a4, s1, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
+; CHECK-RV64VC-NEXT: and a5, a5, a4
; CHECK-RV64VC-NEXT: bltu a3, s1, .LBB913_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a3, s1
@@ -36493,7 +36493,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a0)
; CHECK-RV64VC-NEXT: addi a0, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, t0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v14, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -36501,7 +36501,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64VC-NEXT: add a0, a0, sp
; CHECK-RV64VC-NEXT: addi a0, a0, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v13, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -36512,21 +36512,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: and a0, a6, t0
-; CHECK-RV64VC-NEXT: bltu s0, a7, .LBB913_8
+; CHECK-RV64VC-NEXT: and a0, a7, a6
+; CHECK-RV64VC-NEXT: bltu s0, a2, .LBB913_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv s0, a7
+; CHECK-RV64VC-NEXT: mv s0, a2
; CHECK-RV64VC-NEXT: .LBB913_8:
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64VC-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64VC-NEXT: sub a0, s0, a1
-; CHECK-RV64VC-NEXT: sltu a2, s0, a0
+; CHECK-RV64VC-NEXT: sltu a2, a1, s0
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: sub a2, a0, s1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, s1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: bltu a0, s1, .LBB913_10
@@ -36552,7 +36552,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV64VC-NEXT: mv s0, a1
; CHECK-RV64VC-NEXT: .LBB913_12:
; CHECK-RV64VC-NEXT: sub a0, s0, s1
-; CHECK-RV64VC-NEXT: sltu a1, s0, a0
+; CHECK-RV64VC-NEXT: sltu a1, s1, s0
; CHECK-RV64VC-NEXT: addi a1, a1, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -36607,45 +36607,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_ALL(<vscale x 64 x
; CHECK-RV32VC-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT: csrr a4, vlenb
+; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT: slli a3, a4, 3
-; CHECK-RV32VC-NEXT: slli a1, a4, 2
-; CHECK-RV32VC-NEXT: add a0, a0, a3
-; CHECK-RV32VC-NEXT: sub a3, a2, a1
+; CHECK-RV32VC-NEXT: slli a4, a1, 3
+; CHECK-RV32VC-NEXT: slli a3, a1, 2
+; CHECK-RV32VC-NEXT: slli a1, a1, 1
+; CHECK-RV32VC-NEXT: add a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a2, a3
+; CHECK-RV32VC-NEXT: sltu a5, a3, a2
; CHECK-RV32VC-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT: sltu a0, a2, a3
-; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a3, a3, a0
-; CHECK-RV32VC-NEXT: slli a0, a4, 1
-; CHECK-RV32VC-NEXT: sub a4, a3, a0
-; CHECK-RV32VC-NEXT: sltu a5, a3, a4
+; CHECK-RV32VC-NEXT: addi a0, a5, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a0, a1
+; CHECK-RV32VC-NEXT: sltu a5, a1, a0
; CHECK-RV32VC-NEXT: addi a5, a5, -1
; CHECK-RV32VC-NEXT: and a4, a4, a5
-; CHECK-RV32VC-NEXT: bltu a3, a0, .LBB913_2
+; CHECK-RV32VC-NEXT: bltu a0, a1, .LBB913_2
; CHECK-RV32VC-NEXT: # %bb.1:
-; CHECK-RV32VC-NEXT: mv a3, a0
+; CHECK-RV32VC-NEXT: mv a0, a1
; CHECK-RV32VC-NEXT: .LBB913_2:
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB913_4
+; CHECK-RV32VC-NEXT: bltu a2, a3, .LBB913_4
; CHECK-RV32VC-NEXT: # %bb.3:
-; CHECK-RV32VC-NEXT: mv a2, a1
+; CHECK-RV32VC-NEXT: mv a2, a3
; CHECK-RV32VC-NEXT: .LBB913_4:
-; CHECK-RV32VC-NEXT: sub a1, a2, a0
-; CHECK-RV32VC-NEXT: sltu a3, a2, a1
+; CHECK-RV32VC-NEXT: sub a0, a2, a1
+; CHECK-RV32VC-NEXT: sltu a3, a1, a2
; CHECK-RV32VC-NEXT: addi a3, a3, -1
-; CHECK-RV32VC-NEXT: and a1, a1, a3
-; CHECK-RV32VC-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: and a0, a0, a3
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT: bltu a2, a0, .LBB913_6
+; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB913_6
; CHECK-RV32VC-NEXT: # %bb.5:
-; CHECK-RV32VC-NEXT: mv a2, a0
+; CHECK-RV32VC-NEXT: mv a2, a1
; CHECK-RV32VC-NEXT: .LBB913_6:
; CHECK-RV32VC-NEXT: addi a0, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -36738,30 +36738,30 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64V-NEXT: mv a0, s1
; CHECK-RV64V-NEXT: call __muldi3
; CHECK-RV64V-NEXT: slli a2, s1, 2
-; CHECK-RV64V-NEXT: sub a1, s0, a2
-; CHECK-RV64V-NEXT: sltu a3, s0, a1
-; CHECK-RV64V-NEXT: addi a3, a3, -1
-; CHECK-RV64V-NEXT: and a3, a3, a1
; CHECK-RV64V-NEXT: slli a1, s1, 1
+; CHECK-RV64V-NEXT: sub a3, s0, a2
+; CHECK-RV64V-NEXT: sltu a4, a2, s0
+; CHECK-RV64V-NEXT: addi a4, a4, -1
+; CHECK-RV64V-NEXT: and a3, a4, a3
; CHECK-RV64V-NEXT: sub a4, a3, a1
-; CHECK-RV64V-NEXT: sltu a5, a3, a4
+; CHECK-RV64V-NEXT: sltu a5, a1, a3
; CHECK-RV64V-NEXT: addi a5, a5, -1
-; CHECK-RV64V-NEXT: and a6, a5, a4
-; CHECK-RV64V-NEXT: sub a4, a6, s1
-; CHECK-RV64V-NEXT: mv a5, a6
-; CHECK-RV64V-NEXT: bltu a6, s1, .LBB914_2
+; CHECK-RV64V-NEXT: and a5, a5, a4
+; CHECK-RV64V-NEXT: mv a4, a5
+; CHECK-RV64V-NEXT: bltu a5, s1, .LBB914_2
; CHECK-RV64V-NEXT: # %bb.1:
-; CHECK-RV64V-NEXT: mv a5, s1
+; CHECK-RV64V-NEXT: mv a4, s1
; CHECK-RV64V-NEXT: .LBB914_2:
-; CHECK-RV64V-NEXT: sltu a7, a6, a4
+; CHECK-RV64V-NEXT: sltu a7, s1, a5
; CHECK-RV64V-NEXT: bltu a3, a1, .LBB914_4
; CHECK-RV64V-NEXT: # %bb.3:
; CHECK-RV64V-NEXT: mv a3, a1
; CHECK-RV64V-NEXT: .LBB914_4:
; CHECK-RV64V-NEXT: add a6, s2, a0
-; CHECK-RV64V-NEXT: addi a0, a7, -1
+; CHECK-RV64V-NEXT: sub a0, a5, s1
+; CHECK-RV64V-NEXT: addi a5, a7, -1
; CHECK-RV64V-NEXT: sub a7, a3, s1
-; CHECK-RV64V-NEXT: sltu t0, a3, a7
+; CHECK-RV64V-NEXT: sltu t0, s1, a3
; CHECK-RV64V-NEXT: addi t0, t0, -1
; CHECK-RV64V-NEXT: and a7, t0, a7
; CHECK-RV64V-NEXT: bltu a3, s1, .LBB914_6
@@ -36771,26 +36771,26 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64V-NEXT: vl8re64.v v16, (a6)
; CHECK-RV64V-NEXT: addi a6, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vluxei64.v v14, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 3
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 3
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a7, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vluxei64.v v13, (zero), v24
-; CHECK-RV64V-NEXT: csrr a5, vlenb
-; CHECK-RV64V-NEXT: slli a5, a5, 4
-; CHECK-RV64V-NEXT: add a5, sp, a5
-; CHECK-RV64V-NEXT: addi a5, a5, 16
-; CHECK-RV64V-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: csrr a4, vlenb
+; CHECK-RV64V-NEXT: slli a4, a4, 4
+; CHECK-RV64V-NEXT: add a4, sp, a4
+; CHECK-RV64V-NEXT: addi a4, a4, 16
+; CHECK-RV64V-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64V-NEXT: and a0, a0, a4
+; CHECK-RV64V-NEXT: and a0, a5, a0
; CHECK-RV64V-NEXT: bltu s0, a2, .LBB914_8
; CHECK-RV64V-NEXT: # %bb.7:
; CHECK-RV64V-NEXT: mv s0, a2
@@ -36800,11 +36800,11 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64V-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64V-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64V-NEXT: sub a0, s0, a1
-; CHECK-RV64V-NEXT: sltu a2, s0, a0
+; CHECK-RV64V-NEXT: sltu a2, a1, s0
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: and a0, a2, a0
; CHECK-RV64V-NEXT: sub a2, a0, s1
-; CHECK-RV64V-NEXT: sltu a3, a0, a2
+; CHECK-RV64V-NEXT: sltu a3, s1, a0
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a2, a3, a2
; CHECK-RV64V-NEXT: bltu a0, s1, .LBB914_10
@@ -36830,7 +36830,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64V-NEXT: mv s0, a1
; CHECK-RV64V-NEXT: .LBB914_12:
; CHECK-RV64V-NEXT: sub a0, s0, s1
-; CHECK-RV64V-NEXT: sltu a1, s0, a0
+; CHECK-RV64V-NEXT: sltu a1, s1, s0
; CHECK-RV64V-NEXT: addi a1, a1, -1
; CHECK-RV64V-NEXT: and a0, a1, a0
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -36885,45 +36885,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV32V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32V-NEXT: csrr a4, vlenb
+; CHECK-RV32V-NEXT: csrr a1, vlenb
; CHECK-RV32V-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32V-NEXT: slli a3, a4, 3
-; CHECK-RV32V-NEXT: slli a1, a4, 2
-; CHECK-RV32V-NEXT: add a0, a0, a3
-; CHECK-RV32V-NEXT: sub a3, a2, a1
+; CHECK-RV32V-NEXT: slli a4, a1, 3
+; CHECK-RV32V-NEXT: slli a3, a1, 2
+; CHECK-RV32V-NEXT: slli a1, a1, 1
+; CHECK-RV32V-NEXT: add a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a2, a3
+; CHECK-RV32V-NEXT: sltu a5, a3, a2
; CHECK-RV32V-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32V-NEXT: sltu a0, a2, a3
-; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a3, a0, a3
-; CHECK-RV32V-NEXT: slli a0, a4, 1
-; CHECK-RV32V-NEXT: sub a4, a3, a0
-; CHECK-RV32V-NEXT: sltu a5, a3, a4
+; CHECK-RV32V-NEXT: addi a0, a5, -1
+; CHECK-RV32V-NEXT: and a0, a0, a4
+; CHECK-RV32V-NEXT: sub a4, a0, a1
+; CHECK-RV32V-NEXT: sltu a5, a1, a0
; CHECK-RV32V-NEXT: addi a5, a5, -1
; CHECK-RV32V-NEXT: and a4, a5, a4
-; CHECK-RV32V-NEXT: bltu a3, a0, .LBB914_2
+; CHECK-RV32V-NEXT: bltu a0, a1, .LBB914_2
; CHECK-RV32V-NEXT: # %bb.1:
-; CHECK-RV32V-NEXT: mv a3, a0
+; CHECK-RV32V-NEXT: mv a0, a1
; CHECK-RV32V-NEXT: .LBB914_2:
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32V-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32V-NEXT: bltu a2, a1, .LBB914_4
+; CHECK-RV32V-NEXT: bltu a2, a3, .LBB914_4
; CHECK-RV32V-NEXT: # %bb.3:
-; CHECK-RV32V-NEXT: mv a2, a1
+; CHECK-RV32V-NEXT: mv a2, a3
; CHECK-RV32V-NEXT: .LBB914_4:
-; CHECK-RV32V-NEXT: sub a1, a2, a0
-; CHECK-RV32V-NEXT: sltu a3, a2, a1
+; CHECK-RV32V-NEXT: sub a0, a2, a1
+; CHECK-RV32V-NEXT: sltu a3, a1, a2
; CHECK-RV32V-NEXT: addi a3, a3, -1
-; CHECK-RV32V-NEXT: and a1, a3, a1
-; CHECK-RV32V-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32V-NEXT: and a0, a3, a0
+; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32V-NEXT: bltu a2, a0, .LBB914_6
+; CHECK-RV32V-NEXT: bltu a2, a1, .LBB914_6
; CHECK-RV32V-NEXT: # %bb.5:
-; CHECK-RV32V-NEXT: mv a2, a0
+; CHECK-RV32V-NEXT: mv a2, a1
; CHECK-RV32V-NEXT: .LBB914_6:
; CHECK-RV32V-NEXT: addi a0, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -37011,33 +37011,33 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: li a1, 40
; CHECK-RV64VC-NEXT: mv a0, s1
; CHECK-RV64VC-NEXT: call __muldi3
-; CHECK-RV64VC-NEXT: slli a7, s1, 2
-; CHECK-RV64VC-NEXT: sub a1, s0, a7
-; CHECK-RV64VC-NEXT: sltu a2, s0, a1
-; CHECK-RV64VC-NEXT: addi a2, a2, -1
-; CHECK-RV64VC-NEXT: and a3, a2, a1
+; CHECK-RV64VC-NEXT: slli a2, s1, 2
; CHECK-RV64VC-NEXT: slli a1, s1, 1
-; CHECK-RV64VC-NEXT: sub a2, a3, a1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a3, s0, a2
+; CHECK-RV64VC-NEXT: sltu a4, a2, s0
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
-; CHECK-RV64VC-NEXT: sub t0, a2, s1
-; CHECK-RV64VC-NEXT: mv a5, a2
-; CHECK-RV64VC-NEXT: bltu a2, s1, .LBB914_2
+; CHECK-RV64VC-NEXT: and a3, a3, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a5, a1, a3
+; CHECK-RV64VC-NEXT: addi a5, a5, -1
+; CHECK-RV64VC-NEXT: and a5, a5, a4
+; CHECK-RV64VC-NEXT: mv t0, a5
+; CHECK-RV64VC-NEXT: bltu a5, s1, .LBB914_2
; CHECK-RV64VC-NEXT: # %bb.1:
-; CHECK-RV64VC-NEXT: mv a5, s1
+; CHECK-RV64VC-NEXT: mv t0, s1
; CHECK-RV64VC-NEXT: .LBB914_2:
-; CHECK-RV64VC-NEXT: sltu a6, a2, t0
+; CHECK-RV64VC-NEXT: sltu a7, s1, a5
; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB914_4
; CHECK-RV64VC-NEXT: # %bb.3:
; CHECK-RV64VC-NEXT: mv a3, a1
; CHECK-RV64VC-NEXT: .LBB914_4:
; CHECK-RV64VC-NEXT: add a0, a0, s2
-; CHECK-RV64VC-NEXT: addi a6, a6, -1
-; CHECK-RV64VC-NEXT: sub a2, a3, s1
-; CHECK-RV64VC-NEXT: sltu a4, a3, a2
+; CHECK-RV64VC-NEXT: sub a6, a5, s1
+; CHECK-RV64VC-NEXT: addi a7, a7, -1
+; CHECK-RV64VC-NEXT: sub a5, a3, s1
+; CHECK-RV64VC-NEXT: sltu a4, s1, a3
; CHECK-RV64VC-NEXT: addi a4, a4, -1
-; CHECK-RV64VC-NEXT: and a2, a2, a4
+; CHECK-RV64VC-NEXT: and a5, a5, a4
; CHECK-RV64VC-NEXT: bltu a3, s1, .LBB914_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a3, s1
@@ -37045,7 +37045,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a0)
; CHECK-RV64VC-NEXT: addi a0, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, t0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v14, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -37053,7 +37053,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: add a0, a0, sp
; CHECK-RV64VC-NEXT: addi a0, a0, 16
; CHECK-RV64VC-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a5, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v13, (zero), v24
; CHECK-RV64VC-NEXT: csrr a0, vlenb
@@ -37064,21 +37064,21 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v12, (zero), v24
-; CHECK-RV64VC-NEXT: and a0, a6, t0
-; CHECK-RV64VC-NEXT: bltu s0, a7, .LBB914_8
+; CHECK-RV64VC-NEXT: and a0, a7, a6
+; CHECK-RV64VC-NEXT: bltu s0, a2, .LBB914_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv s0, a7
+; CHECK-RV64VC-NEXT: mv s0, a2
; CHECK-RV64VC-NEXT: .LBB914_8:
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vluxei64.v v15, (zero), v16
; CHECK-RV64VC-NEXT: vl8re64.v v16, (s2)
; CHECK-RV64VC-NEXT: sub a0, s0, a1
-; CHECK-RV64VC-NEXT: sltu a2, s0, a0
+; CHECK-RV64VC-NEXT: sltu a2, a1, s0
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: and a0, a0, a2
; CHECK-RV64VC-NEXT: sub a2, a0, s1
-; CHECK-RV64VC-NEXT: sltu a3, a0, a2
+; CHECK-RV64VC-NEXT: sltu a3, s1, a0
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a3
; CHECK-RV64VC-NEXT: bltu a0, s1, .LBB914_10
@@ -37104,7 +37104,7 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV64VC-NEXT: mv s0, a1
; CHECK-RV64VC-NEXT: .LBB914_12:
; CHECK-RV64VC-NEXT: sub a0, s0, s1
-; CHECK-RV64VC-NEXT: sltu a1, s0, a0
+; CHECK-RV64VC-NEXT: sltu a1, s1, s0
; CHECK-RV64VC-NEXT: addi a1, a1, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -37159,45 +37159,45 @@ define <vscale x 64 x i8> @test_nontemporal_vp_gather_nxv64i8_DEFAULT(<vscale x
; CHECK-RV32VC-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV32VC-NEXT: csrr a4, vlenb
+; CHECK-RV32VC-NEXT: csrr a1, vlenb
; CHECK-RV32VC-NEXT: vl8re32.v v8, (a0)
-; CHECK-RV32VC-NEXT: slli a3, a4, 3
-; CHECK-RV32VC-NEXT: slli a1, a4, 2
-; CHECK-RV32VC-NEXT: add a0, a0, a3
-; CHECK-RV32VC-NEXT: sub a3, a2, a1
+; CHECK-RV32VC-NEXT: slli a4, a1, 3
+; CHECK-RV32VC-NEXT: slli a3, a1, 2
+; CHECK-RV32VC-NEXT: slli a1, a1, 1
+; CHECK-RV32VC-NEXT: add a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a2, a3
+; CHECK-RV32VC-NEXT: sltu a5, a3, a2
; CHECK-RV32VC-NEXT: vl8re32.v v0, (a0)
-; CHECK-RV32VC-NEXT: sltu a0, a2, a3
-; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a3, a3, a0
-; CHECK-RV32VC-NEXT: slli a0, a4, 1
-; CHECK-RV32VC-NEXT: sub a4, a3, a0
-; CHECK-RV32VC-NEXT: sltu a5, a3, a4
+; CHECK-RV32VC-NEXT: addi a0, a5, -1
+; CHECK-RV32VC-NEXT: and a0, a0, a4
+; CHECK-RV32VC-NEXT: sub a4, a0, a1
+; CHECK-RV32VC-NEXT: sltu a5, a1, a0
; CHECK-RV32VC-NEXT: addi a5, a5, -1
; CHECK-RV32VC-NEXT: and a4, a4, a5
-; CHECK-RV32VC-NEXT: bltu a3, a0, .LBB914_2
+; CHECK-RV32VC-NEXT: bltu a0, a1, .LBB914_2
; CHECK-RV32VC-NEXT: # %bb.1:
-; CHECK-RV32VC-NEXT: mv a3, a0
+; CHECK-RV32VC-NEXT: mv a0, a1
; CHECK-RV32VC-NEXT: .LBB914_2:
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vluxei32.v v30, (zero), v0
-; CHECK-RV32VC-NEXT: vsetvli zero, a3, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vluxei32.v v28, (zero), v8
-; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB914_4
+; CHECK-RV32VC-NEXT: bltu a2, a3, .LBB914_4
; CHECK-RV32VC-NEXT: # %bb.3:
-; CHECK-RV32VC-NEXT: mv a2, a1
+; CHECK-RV32VC-NEXT: mv a2, a3
; CHECK-RV32VC-NEXT: .LBB914_4:
-; CHECK-RV32VC-NEXT: sub a1, a2, a0
-; CHECK-RV32VC-NEXT: sltu a3, a2, a1
+; CHECK-RV32VC-NEXT: sub a0, a2, a1
+; CHECK-RV32VC-NEXT: sltu a3, a1, a2
; CHECK-RV32VC-NEXT: addi a3, a3, -1
-; CHECK-RV32VC-NEXT: and a1, a1, a3
-; CHECK-RV32VC-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32VC-NEXT: and a0, a0, a3
+; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vluxei32.v v26, (zero), v16
-; CHECK-RV32VC-NEXT: bltu a2, a0, .LBB914_6
+; CHECK-RV32VC-NEXT: bltu a2, a1, .LBB914_6
; CHECK-RV32VC-NEXT: # %bb.5:
-; CHECK-RV32VC-NEXT: mv a2, a0
+; CHECK-RV32VC-NEXT: mv a2, a1
; CHECK-RV32VC-NEXT: .LBB914_6:
; CHECK-RV32VC-NEXT: addi a0, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
@@ -37342,9 +37342,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, s0
+; CHECK-RV64V-NEXT: sltu a4, s0, a4
; CHECK-RV64V-NEXT: sub a5, a3, a1
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a5
+; CHECK-RV64V-NEXT: sltu a3, a1, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
@@ -37360,17 +37360,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT: sub a3, a0, s0
-; CHECK-RV64V-NEXT: sub a2, s1, a2
-; CHECK-RV64V-NEXT: sltu a0, a0, a3
-; CHECK-RV64V-NEXT: sltu a4, s1, a2
+; CHECK-RV64V-NEXT: sub a3, s1, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, s1
+; CHECK-RV64V-NEXT: sub a4, a0, s0
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: and a3, a0, a3
-; CHECK-RV64V-NEXT: and a0, a4, a2
+; CHECK-RV64V-NEXT: addi a2, a2, -1
+; CHECK-RV64V-NEXT: and a4, a0, a4
+; CHECK-RV64V-NEXT: and a0, a2, a3
; CHECK-RV64V-NEXT: addi a2, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64V-NEXT: mv a2, a0
@@ -37391,23 +37391,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT: sub a3, a2, s0
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a2, a2, a3
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a3, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
+; CHECK-RV64V-NEXT: sub a1, a2, s0
+; CHECK-RV64V-NEXT: sltu a2, s0, a2
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a2, a2, a3
-; CHECK-RV64V-NEXT: and a0, a0, a1
-; CHECK-RV64V-NEXT: csrr a1, vlenb
-; CHECK-RV64V-NEXT: slli a1, a1, 3
-; CHECK-RV64V-NEXT: mv a3, a1
-; CHECK-RV64V-NEXT: slli a1, a1, 1
-; CHECK-RV64V-NEXT: add a1, a1, a3
-; CHECK-RV64V-NEXT: add a1, sp, a1
-; CHECK-RV64V-NEXT: addi a1, a1, 16
-; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: and a1, a2, a1
+; CHECK-RV64V-NEXT: and a0, a0, a3
+; CHECK-RV64V-NEXT: csrr a2, vlenb
+; CHECK-RV64V-NEXT: slli a2, a2, 3
+; CHECK-RV64V-NEXT: mv a3, a2
+; CHECK-RV64V-NEXT: slli a2, a2, 1
+; CHECK-RV64V-NEXT: add a2, a2, a3
+; CHECK-RV64V-NEXT: add a2, sp, a2
+; CHECK-RV64V-NEXT: addi a2, a2, 16
+; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: vsetvli zero, a1, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v29, (zero), v8
; CHECK-RV64V-NEXT: mv a1, a0
@@ -37424,7 +37424,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: ntl.p1
; CHECK-RV64V-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64V-NEXT: sub a1, a0, s0
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
; CHECK-RV64V-NEXT: and a0, a0, a1
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -37493,13 +37493,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32V-NEXT: sub a0, a4, a1
-; CHECK-RV32V-NEXT: sub a2, a3, a2
-; CHECK-RV32V-NEXT: sltu a4, a4, a0
-; CHECK-RV32V-NEXT: sltu a3, a3, a2
+; CHECK-RV32V-NEXT: sltu a4, a1, a4
+; CHECK-RV32V-NEXT: sub a5, a3, a2
+; CHECK-RV32V-NEXT: sltu a2, a2, a3
; CHECK-RV32V-NEXT: addi a4, a4, -1
-; CHECK-RV32V-NEXT: addi a3, a3, -1
+; CHECK-RV32V-NEXT: addi a2, a2, -1
; CHECK-RV32V-NEXT: and a4, a4, a0
-; CHECK-RV32V-NEXT: and a0, a3, a2
+; CHECK-RV32V-NEXT: and a0, a2, a5
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v0
@@ -37511,10 +37511,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.p1
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT: sub a1, a0, a1
-; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: sub a2, a0, a1
+; CHECK-RV32V-NEXT: sltu a0, a1, a0
; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a0, a0, a1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -37609,102 +37609,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64VC-NEXT: mv a3, a6
; CHECK-RV64VC-NEXT: .LBB915_2:
; CHECK-RV64VC-NEXT: slli a5, s0, 4
-; CHECK-RV64VC-NEXT: slli a7, s0, 1
-; CHECK-RV64VC-NEXT: slli a2, s0, 3
+; CHECK-RV64VC-NEXT: slli a1, s0, 1
+; CHECK-RV64VC-NEXT: slli a7, s0, 3
; CHECK-RV64VC-NEXT: mv a4, a3
-; CHECK-RV64VC-NEXT: bltu a3, a7, .LBB915_4
+; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB915_4
; CHECK-RV64VC-NEXT: # %bb.3:
-; CHECK-RV64VC-NEXT: mv a4, a7
+; CHECK-RV64VC-NEXT: mv a4, a1
; CHECK-RV64VC-NEXT: .LBB915_4:
; CHECK-RV64VC-NEXT: vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT: add a1, s1, a0
+; CHECK-RV64VC-NEXT: add a2, s1, a0
; CHECK-RV64VC-NEXT: add a5, a5, s1
-; CHECK-RV64VC-NEXT: add a2, a2, s1
+; CHECK-RV64VC-NEXT: add a7, a7, s1
; CHECK-RV64VC-NEXT: mv a0, a4
; CHECK-RV64VC-NEXT: bltu a4, s0, .LBB915_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a0, s0
; CHECK-RV64VC-NEXT: .LBB915_6:
-; CHECK-RV64VC-NEXT: vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 3
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 3
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT: addi a1, sp, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT: vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: mv a2, a1
-; CHECK-RV64VC-NEXT: slli a1, a1, 1
-; CHECK-RV64VC-NEXT: add a1, a1, a2
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: addi a2, sp, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: mv a5, a2
+; CHECK-RV64VC-NEXT: slli a2, a2, 1
+; CHECK-RV64VC-NEXT: add a2, a2, a5
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, s0
-; CHECK-RV64VC-NEXT: sub a1, a3, a7
-; CHECK-RV64VC-NEXT: sltu a2, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a1
+; CHECK-RV64VC-NEXT: sltu a2, s0, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a3, a1, a3
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a1
+; CHECK-RV64VC-NEXT: and a0, a3, a4
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT: mv a1, a0
+; CHECK-RV64VC-NEXT: mv a2, a0
; CHECK-RV64VC-NEXT: bltu a0, s0, .LBB915_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a2, s0
; CHECK-RV64VC-NEXT: .LBB915_8:
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT: sub a1, a0, s0
; CHECK-RV64VC-NEXT: sub a2, s2, a6
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, s2, a2
+; CHECK-RV64VC-NEXT: sltu a3, a6, s2
+; CHECK-RV64VC-NEXT: sub a4, a0, s0
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
-; CHECK-RV64VC-NEXT: and a1, a1, a0
+; CHECK-RV64VC-NEXT: and a4, a4, a0
; CHECK-RV64VC-NEXT: and a0, a3, a2
; CHECK-RV64VC-NEXT: addi a2, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64VC-NEXT: mv a2, a0
-; CHECK-RV64VC-NEXT: bltu a0, a7, .LBB915_10
+; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB915_10
; CHECK-RV64VC-NEXT: # %bb.9:
-; CHECK-RV64VC-NEXT: mv a2, a7
+; CHECK-RV64VC-NEXT: mv a2, a1
; CHECK-RV64VC-NEXT: .LBB915_10:
-; CHECK-RV64VC-NEXT: mv a1, a2
+; CHECK-RV64VC-NEXT: mv a3, a2
; CHECK-RV64VC-NEXT: bltu a2, s0, .LBB915_12
; CHECK-RV64VC-NEXT: # %bb.11:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a3, s0
; CHECK-RV64VC-NEXT: .LBB915_12:
-; CHECK-RV64VC-NEXT: csrr a3, vlenb
-; CHECK-RV64VC-NEXT: slli a3, a3, 3
-; CHECK-RV64VC-NEXT: add a3, a3, sp
-; CHECK-RV64VC-NEXT: addi a3, a3, 16
-; CHECK-RV64VC-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: csrr a4, vlenb
+; CHECK-RV64VC-NEXT: slli a4, a4, 3
+; CHECK-RV64VC-NEXT: add a4, a4, sp
+; CHECK-RV64VC-NEXT: addi a4, a4, 16
+; CHECK-RV64VC-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT: sub a3, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: sub a1, a2, s0
-; CHECK-RV64VC-NEXT: sub a3, a0, a7
-; CHECK-RV64VC-NEXT: sltu a2, a2, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a3
+; CHECK-RV64VC-NEXT: sltu a2, s0, a2
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a1, a1, a2
@@ -37734,7 +37734,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64VC-NEXT: c.ntl.p1
; CHECK-RV64VC-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64VC-NEXT: sub a1, a0, s0
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -37803,13 +37803,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32VC-NEXT: sub a0, a4, a1
-; CHECK-RV32VC-NEXT: sub a2, a3, a2
-; CHECK-RV32VC-NEXT: sltu a4, a4, a0
-; CHECK-RV32VC-NEXT: sltu a3, a3, a2
+; CHECK-RV32VC-NEXT: sltu a4, a1, a4
+; CHECK-RV32VC-NEXT: sub a5, a3, a2
+; CHECK-RV32VC-NEXT: sltu a2, a2, a3
; CHECK-RV32VC-NEXT: addi a4, a4, -1
-; CHECK-RV32VC-NEXT: addi a3, a3, -1
+; CHECK-RV32VC-NEXT: addi a2, a2, -1
; CHECK-RV32VC-NEXT: and a4, a4, a0
-; CHECK-RV32VC-NEXT: and a0, a3, a2
+; CHECK-RV32VC-NEXT: and a0, a2, a5
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v0
@@ -37821,10 +37821,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.p1
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT: sub a1, a0, a1
-; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a0, a1
+; CHECK-RV32VC-NEXT: sltu a0, a1, a0
; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -37967,9 +37967,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, s0
+; CHECK-RV64V-NEXT: sltu a4, s0, a4
; CHECK-RV64V-NEXT: sub a5, a3, a1
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a5
+; CHECK-RV64V-NEXT: sltu a3, a1, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
@@ -37985,17 +37985,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT: sub a3, a0, s0
-; CHECK-RV64V-NEXT: sub a2, s1, a2
-; CHECK-RV64V-NEXT: sltu a0, a0, a3
-; CHECK-RV64V-NEXT: sltu a4, s1, a2
+; CHECK-RV64V-NEXT: sub a3, s1, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, s1
+; CHECK-RV64V-NEXT: sub a4, a0, s0
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: and a3, a0, a3
-; CHECK-RV64V-NEXT: and a0, a4, a2
+; CHECK-RV64V-NEXT: addi a2, a2, -1
+; CHECK-RV64V-NEXT: and a4, a0, a4
+; CHECK-RV64V-NEXT: and a0, a2, a3
; CHECK-RV64V-NEXT: addi a2, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64V-NEXT: mv a2, a0
@@ -38016,23 +38016,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT: sub a3, a2, s0
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a2, a2, a3
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a3, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
+; CHECK-RV64V-NEXT: sub a1, a2, s0
+; CHECK-RV64V-NEXT: sltu a2, s0, a2
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a2, a2, a3
-; CHECK-RV64V-NEXT: and a0, a0, a1
-; CHECK-RV64V-NEXT: csrr a1, vlenb
-; CHECK-RV64V-NEXT: slli a1, a1, 3
-; CHECK-RV64V-NEXT: mv a3, a1
-; CHECK-RV64V-NEXT: slli a1, a1, 1
-; CHECK-RV64V-NEXT: add a1, a1, a3
-; CHECK-RV64V-NEXT: add a1, sp, a1
-; CHECK-RV64V-NEXT: addi a1, a1, 16
-; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: and a1, a2, a1
+; CHECK-RV64V-NEXT: and a0, a0, a3
+; CHECK-RV64V-NEXT: csrr a2, vlenb
+; CHECK-RV64V-NEXT: slli a2, a2, 3
+; CHECK-RV64V-NEXT: mv a3, a2
+; CHECK-RV64V-NEXT: slli a2, a2, 1
+; CHECK-RV64V-NEXT: add a2, a2, a3
+; CHECK-RV64V-NEXT: add a2, sp, a2
+; CHECK-RV64V-NEXT: addi a2, a2, 16
+; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: vsetvli zero, a1, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v29, (zero), v8
; CHECK-RV64V-NEXT: mv a1, a0
@@ -38049,7 +38049,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV64V-NEXT: ntl.pall
; CHECK-RV64V-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64V-NEXT: sub a1, a0, s0
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
; CHECK-RV64V-NEXT: and a0, a0, a1
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -38118,13 +38118,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32V-NEXT: sub a0, a4, a1
-; CHECK-RV32V-NEXT: sub a2, a3, a2
-; CHECK-RV32V-NEXT: sltu a4, a4, a0
-; CHECK-RV32V-NEXT: sltu a3, a3, a2
+; CHECK-RV32V-NEXT: sltu a4, a1, a4
+; CHECK-RV32V-NEXT: sub a5, a3, a2
+; CHECK-RV32V-NEXT: sltu a2, a2, a3
; CHECK-RV32V-NEXT: addi a4, a4, -1
-; CHECK-RV32V-NEXT: addi a3, a3, -1
+; CHECK-RV32V-NEXT: addi a2, a2, -1
; CHECK-RV32V-NEXT: and a4, a4, a0
-; CHECK-RV32V-NEXT: and a0, a3, a2
+; CHECK-RV32V-NEXT: and a0, a2, a5
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v0
@@ -38136,10 +38136,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.pall
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT: sub a1, a0, a1
-; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: sub a2, a0, a1
+; CHECK-RV32V-NEXT: sltu a0, a1, a0
; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a0, a0, a1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -38234,102 +38234,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV64VC-NEXT: mv a3, a6
; CHECK-RV64VC-NEXT: .LBB916_2:
; CHECK-RV64VC-NEXT: slli a5, s0, 4
-; CHECK-RV64VC-NEXT: slli a7, s0, 1
-; CHECK-RV64VC-NEXT: slli a2, s0, 3
+; CHECK-RV64VC-NEXT: slli a1, s0, 1
+; CHECK-RV64VC-NEXT: slli a7, s0, 3
; CHECK-RV64VC-NEXT: mv a4, a3
-; CHECK-RV64VC-NEXT: bltu a3, a7, .LBB916_4
+; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB916_4
; CHECK-RV64VC-NEXT: # %bb.3:
-; CHECK-RV64VC-NEXT: mv a4, a7
+; CHECK-RV64VC-NEXT: mv a4, a1
; CHECK-RV64VC-NEXT: .LBB916_4:
; CHECK-RV64VC-NEXT: vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT: add a1, s1, a0
+; CHECK-RV64VC-NEXT: add a2, s1, a0
; CHECK-RV64VC-NEXT: add a5, a5, s1
-; CHECK-RV64VC-NEXT: add a2, a2, s1
+; CHECK-RV64VC-NEXT: add a7, a7, s1
; CHECK-RV64VC-NEXT: mv a0, a4
; CHECK-RV64VC-NEXT: bltu a4, s0, .LBB916_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a0, s0
; CHECK-RV64VC-NEXT: .LBB916_6:
-; CHECK-RV64VC-NEXT: vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 3
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 3
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT: addi a1, sp, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT: vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: mv a2, a1
-; CHECK-RV64VC-NEXT: slli a1, a1, 1
-; CHECK-RV64VC-NEXT: add a1, a1, a2
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: addi a2, sp, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: mv a5, a2
+; CHECK-RV64VC-NEXT: slli a2, a2, 1
+; CHECK-RV64VC-NEXT: add a2, a2, a5
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, s0
-; CHECK-RV64VC-NEXT: sub a1, a3, a7
-; CHECK-RV64VC-NEXT: sltu a2, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a1
+; CHECK-RV64VC-NEXT: sltu a2, s0, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a3, a1, a3
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a1
+; CHECK-RV64VC-NEXT: and a0, a3, a4
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT: mv a1, a0
+; CHECK-RV64VC-NEXT: mv a2, a0
; CHECK-RV64VC-NEXT: bltu a0, s0, .LBB916_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a2, s0
; CHECK-RV64VC-NEXT: .LBB916_8:
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT: sub a1, a0, s0
; CHECK-RV64VC-NEXT: sub a2, s2, a6
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, s2, a2
+; CHECK-RV64VC-NEXT: sltu a3, a6, s2
+; CHECK-RV64VC-NEXT: sub a4, a0, s0
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
-; CHECK-RV64VC-NEXT: and a1, a1, a0
+; CHECK-RV64VC-NEXT: and a4, a4, a0
; CHECK-RV64VC-NEXT: and a0, a3, a2
; CHECK-RV64VC-NEXT: addi a2, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64VC-NEXT: mv a2, a0
-; CHECK-RV64VC-NEXT: bltu a0, a7, .LBB916_10
+; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB916_10
; CHECK-RV64VC-NEXT: # %bb.9:
-; CHECK-RV64VC-NEXT: mv a2, a7
+; CHECK-RV64VC-NEXT: mv a2, a1
; CHECK-RV64VC-NEXT: .LBB916_10:
-; CHECK-RV64VC-NEXT: mv a1, a2
+; CHECK-RV64VC-NEXT: mv a3, a2
; CHECK-RV64VC-NEXT: bltu a2, s0, .LBB916_12
; CHECK-RV64VC-NEXT: # %bb.11:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a3, s0
; CHECK-RV64VC-NEXT: .LBB916_12:
-; CHECK-RV64VC-NEXT: csrr a3, vlenb
-; CHECK-RV64VC-NEXT: slli a3, a3, 3
-; CHECK-RV64VC-NEXT: add a3, a3, sp
-; CHECK-RV64VC-NEXT: addi a3, a3, 16
-; CHECK-RV64VC-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: csrr a4, vlenb
+; CHECK-RV64VC-NEXT: slli a4, a4, 3
+; CHECK-RV64VC-NEXT: add a4, a4, sp
+; CHECK-RV64VC-NEXT: addi a4, a4, 16
+; CHECK-RV64VC-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT: sub a3, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: sub a1, a2, s0
-; CHECK-RV64VC-NEXT: sub a3, a0, a7
-; CHECK-RV64VC-NEXT: sltu a2, a2, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a3
+; CHECK-RV64VC-NEXT: sltu a2, s0, a2
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a1, a1, a2
@@ -38359,7 +38359,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV64VC-NEXT: c.ntl.pall
; CHECK-RV64VC-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64VC-NEXT: sub a1, a0, s0
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -38428,13 +38428,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32VC-NEXT: sub a0, a4, a1
-; CHECK-RV32VC-NEXT: sub a2, a3, a2
-; CHECK-RV32VC-NEXT: sltu a4, a4, a0
-; CHECK-RV32VC-NEXT: sltu a3, a3, a2
+; CHECK-RV32VC-NEXT: sltu a4, a1, a4
+; CHECK-RV32VC-NEXT: sub a5, a3, a2
+; CHECK-RV32VC-NEXT: sltu a2, a2, a3
; CHECK-RV32VC-NEXT: addi a4, a4, -1
-; CHECK-RV32VC-NEXT: addi a3, a3, -1
+; CHECK-RV32VC-NEXT: addi a2, a2, -1
; CHECK-RV32VC-NEXT: and a4, a4, a0
-; CHECK-RV32VC-NEXT: and a0, a3, a2
+; CHECK-RV32VC-NEXT: and a0, a2, a5
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v0
@@ -38446,10 +38446,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL(<vscale x 64 x i8> %val, <
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.pall
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT: sub a1, a0, a1
-; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a0, a1
+; CHECK-RV32VC-NEXT: sltu a0, a1, a0
; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -38592,9 +38592,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, s0
+; CHECK-RV64V-NEXT: sltu a4, s0, a4
; CHECK-RV64V-NEXT: sub a5, a3, a1
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a5
+; CHECK-RV64V-NEXT: sltu a3, a1, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
@@ -38610,17 +38610,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT: sub a3, a0, s0
-; CHECK-RV64V-NEXT: sub a2, s1, a2
-; CHECK-RV64V-NEXT: sltu a0, a0, a3
-; CHECK-RV64V-NEXT: sltu a4, s1, a2
+; CHECK-RV64V-NEXT: sub a3, s1, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, s1
+; CHECK-RV64V-NEXT: sub a4, a0, s0
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: and a3, a0, a3
-; CHECK-RV64V-NEXT: and a0, a4, a2
+; CHECK-RV64V-NEXT: addi a2, a2, -1
+; CHECK-RV64V-NEXT: and a4, a0, a4
+; CHECK-RV64V-NEXT: and a0, a2, a3
; CHECK-RV64V-NEXT: addi a2, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64V-NEXT: mv a2, a0
@@ -38641,23 +38641,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT: sub a3, a2, s0
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a2, a2, a3
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a3, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
+; CHECK-RV64V-NEXT: sub a1, a2, s0
+; CHECK-RV64V-NEXT: sltu a2, s0, a2
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a2, a2, a3
-; CHECK-RV64V-NEXT: and a0, a0, a1
-; CHECK-RV64V-NEXT: csrr a1, vlenb
-; CHECK-RV64V-NEXT: slli a1, a1, 3
-; CHECK-RV64V-NEXT: mv a3, a1
-; CHECK-RV64V-NEXT: slli a1, a1, 1
-; CHECK-RV64V-NEXT: add a1, a1, a3
-; CHECK-RV64V-NEXT: add a1, sp, a1
-; CHECK-RV64V-NEXT: addi a1, a1, 16
-; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: and a1, a2, a1
+; CHECK-RV64V-NEXT: and a0, a0, a3
+; CHECK-RV64V-NEXT: csrr a2, vlenb
+; CHECK-RV64V-NEXT: slli a2, a2, 3
+; CHECK-RV64V-NEXT: mv a3, a2
+; CHECK-RV64V-NEXT: slli a2, a2, 1
+; CHECK-RV64V-NEXT: add a2, a2, a3
+; CHECK-RV64V-NEXT: add a2, sp, a2
+; CHECK-RV64V-NEXT: addi a2, a2, 16
+; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: vsetvli zero, a1, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v29, (zero), v8
; CHECK-RV64V-NEXT: mv a1, a0
@@ -38674,7 +38674,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64V-NEXT: ntl.s1
; CHECK-RV64V-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64V-NEXT: sub a1, a0, s0
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
; CHECK-RV64V-NEXT: and a0, a0, a1
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -38743,13 +38743,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32V-NEXT: sub a0, a4, a1
-; CHECK-RV32V-NEXT: sub a2, a3, a2
-; CHECK-RV32V-NEXT: sltu a4, a4, a0
-; CHECK-RV32V-NEXT: sltu a3, a3, a2
+; CHECK-RV32V-NEXT: sltu a4, a1, a4
+; CHECK-RV32V-NEXT: sub a5, a3, a2
+; CHECK-RV32V-NEXT: sltu a2, a2, a3
; CHECK-RV32V-NEXT: addi a4, a4, -1
-; CHECK-RV32V-NEXT: addi a3, a3, -1
+; CHECK-RV32V-NEXT: addi a2, a2, -1
; CHECK-RV32V-NEXT: and a4, a4, a0
-; CHECK-RV32V-NEXT: and a0, a3, a2
+; CHECK-RV32V-NEXT: and a0, a2, a5
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v0
@@ -38761,10 +38761,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.s1
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT: sub a1, a0, a1
-; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: sub a2, a0, a1
+; CHECK-RV32V-NEXT: sltu a0, a1, a0
; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a0, a0, a1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -38859,102 +38859,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64VC-NEXT: mv a3, a6
; CHECK-RV64VC-NEXT: .LBB917_2:
; CHECK-RV64VC-NEXT: slli a5, s0, 4
-; CHECK-RV64VC-NEXT: slli a7, s0, 1
-; CHECK-RV64VC-NEXT: slli a2, s0, 3
+; CHECK-RV64VC-NEXT: slli a1, s0, 1
+; CHECK-RV64VC-NEXT: slli a7, s0, 3
; CHECK-RV64VC-NEXT: mv a4, a3
-; CHECK-RV64VC-NEXT: bltu a3, a7, .LBB917_4
+; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB917_4
; CHECK-RV64VC-NEXT: # %bb.3:
-; CHECK-RV64VC-NEXT: mv a4, a7
+; CHECK-RV64VC-NEXT: mv a4, a1
; CHECK-RV64VC-NEXT: .LBB917_4:
; CHECK-RV64VC-NEXT: vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT: add a1, s1, a0
+; CHECK-RV64VC-NEXT: add a2, s1, a0
; CHECK-RV64VC-NEXT: add a5, a5, s1
-; CHECK-RV64VC-NEXT: add a2, a2, s1
+; CHECK-RV64VC-NEXT: add a7, a7, s1
; CHECK-RV64VC-NEXT: mv a0, a4
; CHECK-RV64VC-NEXT: bltu a4, s0, .LBB917_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a0, s0
; CHECK-RV64VC-NEXT: .LBB917_6:
-; CHECK-RV64VC-NEXT: vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 3
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 3
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT: addi a1, sp, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT: vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: mv a2, a1
-; CHECK-RV64VC-NEXT: slli a1, a1, 1
-; CHECK-RV64VC-NEXT: add a1, a1, a2
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: addi a2, sp, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: mv a5, a2
+; CHECK-RV64VC-NEXT: slli a2, a2, 1
+; CHECK-RV64VC-NEXT: add a2, a2, a5
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, s0
-; CHECK-RV64VC-NEXT: sub a1, a3, a7
-; CHECK-RV64VC-NEXT: sltu a2, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a1
+; CHECK-RV64VC-NEXT: sltu a2, s0, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a3, a1, a3
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a1
+; CHECK-RV64VC-NEXT: and a0, a3, a4
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT: mv a1, a0
+; CHECK-RV64VC-NEXT: mv a2, a0
; CHECK-RV64VC-NEXT: bltu a0, s0, .LBB917_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a2, s0
; CHECK-RV64VC-NEXT: .LBB917_8:
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT: sub a1, a0, s0
; CHECK-RV64VC-NEXT: sub a2, s2, a6
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, s2, a2
+; CHECK-RV64VC-NEXT: sltu a3, a6, s2
+; CHECK-RV64VC-NEXT: sub a4, a0, s0
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
-; CHECK-RV64VC-NEXT: and a1, a1, a0
+; CHECK-RV64VC-NEXT: and a4, a4, a0
; CHECK-RV64VC-NEXT: and a0, a3, a2
; CHECK-RV64VC-NEXT: addi a2, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64VC-NEXT: mv a2, a0
-; CHECK-RV64VC-NEXT: bltu a0, a7, .LBB917_10
+; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB917_10
; CHECK-RV64VC-NEXT: # %bb.9:
-; CHECK-RV64VC-NEXT: mv a2, a7
+; CHECK-RV64VC-NEXT: mv a2, a1
; CHECK-RV64VC-NEXT: .LBB917_10:
-; CHECK-RV64VC-NEXT: mv a1, a2
+; CHECK-RV64VC-NEXT: mv a3, a2
; CHECK-RV64VC-NEXT: bltu a2, s0, .LBB917_12
; CHECK-RV64VC-NEXT: # %bb.11:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a3, s0
; CHECK-RV64VC-NEXT: .LBB917_12:
-; CHECK-RV64VC-NEXT: csrr a3, vlenb
-; CHECK-RV64VC-NEXT: slli a3, a3, 3
-; CHECK-RV64VC-NEXT: add a3, a3, sp
-; CHECK-RV64VC-NEXT: addi a3, a3, 16
-; CHECK-RV64VC-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: csrr a4, vlenb
+; CHECK-RV64VC-NEXT: slli a4, a4, 3
+; CHECK-RV64VC-NEXT: add a4, a4, sp
+; CHECK-RV64VC-NEXT: addi a4, a4, 16
+; CHECK-RV64VC-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT: sub a3, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: sub a1, a2, s0
-; CHECK-RV64VC-NEXT: sub a3, a0, a7
-; CHECK-RV64VC-NEXT: sltu a2, a2, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a3
+; CHECK-RV64VC-NEXT: sltu a2, s0, a2
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a1, a1, a2
@@ -38984,7 +38984,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV64VC-NEXT: c.ntl.s1
; CHECK-RV64VC-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64VC-NEXT: sub a1, a0, s0
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -39053,13 +39053,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32VC-NEXT: sub a0, a4, a1
-; CHECK-RV32VC-NEXT: sub a2, a3, a2
-; CHECK-RV32VC-NEXT: sltu a4, a4, a0
-; CHECK-RV32VC-NEXT: sltu a3, a3, a2
+; CHECK-RV32VC-NEXT: sltu a4, a1, a4
+; CHECK-RV32VC-NEXT: sub a5, a3, a2
+; CHECK-RV32VC-NEXT: sltu a2, a2, a3
; CHECK-RV32VC-NEXT: addi a4, a4, -1
-; CHECK-RV32VC-NEXT: addi a3, a3, -1
+; CHECK-RV32VC-NEXT: addi a2, a2, -1
; CHECK-RV32VC-NEXT: and a4, a4, a0
-; CHECK-RV32VC-NEXT: and a0, a3, a2
+; CHECK-RV32VC-NEXT: and a0, a2, a5
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v0
@@ -39071,10 +39071,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1(<vscale x 64 x i8> %val, <vs
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.s1
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT: sub a1, a0, a1
-; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a0, a1
+; CHECK-RV32VC-NEXT: sltu a0, a1, a0
; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -39217,9 +39217,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, s0
+; CHECK-RV64V-NEXT: sltu a4, s0, a4
; CHECK-RV64V-NEXT: sub a5, a3, a1
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a5
+; CHECK-RV64V-NEXT: sltu a3, a1, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
@@ -39235,17 +39235,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT: sub a3, a0, s0
-; CHECK-RV64V-NEXT: sub a2, s1, a2
-; CHECK-RV64V-NEXT: sltu a0, a0, a3
-; CHECK-RV64V-NEXT: sltu a4, s1, a2
+; CHECK-RV64V-NEXT: sub a3, s1, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, s1
+; CHECK-RV64V-NEXT: sub a4, a0, s0
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: and a3, a0, a3
-; CHECK-RV64V-NEXT: and a0, a4, a2
+; CHECK-RV64V-NEXT: addi a2, a2, -1
+; CHECK-RV64V-NEXT: and a4, a0, a4
+; CHECK-RV64V-NEXT: and a0, a2, a3
; CHECK-RV64V-NEXT: addi a2, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64V-NEXT: mv a2, a0
@@ -39266,23 +39266,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT: sub a3, a2, s0
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a2, a2, a3
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a3, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
+; CHECK-RV64V-NEXT: sub a1, a2, s0
+; CHECK-RV64V-NEXT: sltu a2, s0, a2
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a2, a2, a3
-; CHECK-RV64V-NEXT: and a0, a0, a1
-; CHECK-RV64V-NEXT: csrr a1, vlenb
-; CHECK-RV64V-NEXT: slli a1, a1, 3
-; CHECK-RV64V-NEXT: mv a3, a1
-; CHECK-RV64V-NEXT: slli a1, a1, 1
-; CHECK-RV64V-NEXT: add a1, a1, a3
-; CHECK-RV64V-NEXT: add a1, sp, a1
-; CHECK-RV64V-NEXT: addi a1, a1, 16
-; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: and a1, a2, a1
+; CHECK-RV64V-NEXT: and a0, a0, a3
+; CHECK-RV64V-NEXT: csrr a2, vlenb
+; CHECK-RV64V-NEXT: slli a2, a2, 3
+; CHECK-RV64V-NEXT: mv a3, a2
+; CHECK-RV64V-NEXT: slli a2, a2, 1
+; CHECK-RV64V-NEXT: add a2, a2, a3
+; CHECK-RV64V-NEXT: add a2, sp, a2
+; CHECK-RV64V-NEXT: addi a2, a2, 16
+; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: vsetvli zero, a1, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v29, (zero), v8
; CHECK-RV64V-NEXT: mv a1, a0
@@ -39299,7 +39299,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64V-NEXT: sub a1, a0, s0
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
; CHECK-RV64V-NEXT: and a0, a0, a1
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -39368,13 +39368,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32V-NEXT: sub a0, a4, a1
-; CHECK-RV32V-NEXT: sub a2, a3, a2
-; CHECK-RV32V-NEXT: sltu a4, a4, a0
-; CHECK-RV32V-NEXT: sltu a3, a3, a2
+; CHECK-RV32V-NEXT: sltu a4, a1, a4
+; CHECK-RV32V-NEXT: sub a5, a3, a2
+; CHECK-RV32V-NEXT: sltu a2, a2, a3
; CHECK-RV32V-NEXT: addi a4, a4, -1
-; CHECK-RV32V-NEXT: addi a3, a3, -1
+; CHECK-RV32V-NEXT: addi a2, a2, -1
; CHECK-RV32V-NEXT: and a4, a4, a0
-; CHECK-RV32V-NEXT: and a0, a3, a2
+; CHECK-RV32V-NEXT: and a0, a2, a5
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v0
@@ -39386,10 +39386,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT: sub a1, a0, a1
-; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: sub a2, a0, a1
+; CHECK-RV32V-NEXT: sltu a0, a1, a0
; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a0, a0, a1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -39484,102 +39484,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV64VC-NEXT: mv a3, a6
; CHECK-RV64VC-NEXT: .LBB918_2:
; CHECK-RV64VC-NEXT: slli a5, s0, 4
-; CHECK-RV64VC-NEXT: slli a7, s0, 1
-; CHECK-RV64VC-NEXT: slli a2, s0, 3
+; CHECK-RV64VC-NEXT: slli a1, s0, 1
+; CHECK-RV64VC-NEXT: slli a7, s0, 3
; CHECK-RV64VC-NEXT: mv a4, a3
-; CHECK-RV64VC-NEXT: bltu a3, a7, .LBB918_4
+; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB918_4
; CHECK-RV64VC-NEXT: # %bb.3:
-; CHECK-RV64VC-NEXT: mv a4, a7
+; CHECK-RV64VC-NEXT: mv a4, a1
; CHECK-RV64VC-NEXT: .LBB918_4:
; CHECK-RV64VC-NEXT: vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT: add a1, s1, a0
+; CHECK-RV64VC-NEXT: add a2, s1, a0
; CHECK-RV64VC-NEXT: add a5, a5, s1
-; CHECK-RV64VC-NEXT: add a2, a2, s1
+; CHECK-RV64VC-NEXT: add a7, a7, s1
; CHECK-RV64VC-NEXT: mv a0, a4
; CHECK-RV64VC-NEXT: bltu a4, s0, .LBB918_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a0, s0
; CHECK-RV64VC-NEXT: .LBB918_6:
-; CHECK-RV64VC-NEXT: vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 3
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 3
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT: addi a1, sp, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT: vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: mv a2, a1
-; CHECK-RV64VC-NEXT: slli a1, a1, 1
-; CHECK-RV64VC-NEXT: add a1, a1, a2
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: addi a2, sp, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: mv a5, a2
+; CHECK-RV64VC-NEXT: slli a2, a2, 1
+; CHECK-RV64VC-NEXT: add a2, a2, a5
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, s0
-; CHECK-RV64VC-NEXT: sub a1, a3, a7
-; CHECK-RV64VC-NEXT: sltu a2, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a1
+; CHECK-RV64VC-NEXT: sltu a2, s0, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a3, a1, a3
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a1
+; CHECK-RV64VC-NEXT: and a0, a3, a4
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT: mv a1, a0
+; CHECK-RV64VC-NEXT: mv a2, a0
; CHECK-RV64VC-NEXT: bltu a0, s0, .LBB918_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a2, s0
; CHECK-RV64VC-NEXT: .LBB918_8:
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT: sub a1, a0, s0
; CHECK-RV64VC-NEXT: sub a2, s2, a6
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, s2, a2
+; CHECK-RV64VC-NEXT: sltu a3, a6, s2
+; CHECK-RV64VC-NEXT: sub a4, a0, s0
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
-; CHECK-RV64VC-NEXT: and a1, a1, a0
+; CHECK-RV64VC-NEXT: and a4, a4, a0
; CHECK-RV64VC-NEXT: and a0, a3, a2
; CHECK-RV64VC-NEXT: addi a2, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64VC-NEXT: mv a2, a0
-; CHECK-RV64VC-NEXT: bltu a0, a7, .LBB918_10
+; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB918_10
; CHECK-RV64VC-NEXT: # %bb.9:
-; CHECK-RV64VC-NEXT: mv a2, a7
+; CHECK-RV64VC-NEXT: mv a2, a1
; CHECK-RV64VC-NEXT: .LBB918_10:
-; CHECK-RV64VC-NEXT: mv a1, a2
+; CHECK-RV64VC-NEXT: mv a3, a2
; CHECK-RV64VC-NEXT: bltu a2, s0, .LBB918_12
; CHECK-RV64VC-NEXT: # %bb.11:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a3, s0
; CHECK-RV64VC-NEXT: .LBB918_12:
-; CHECK-RV64VC-NEXT: csrr a3, vlenb
-; CHECK-RV64VC-NEXT: slli a3, a3, 3
-; CHECK-RV64VC-NEXT: add a3, a3, sp
-; CHECK-RV64VC-NEXT: addi a3, a3, 16
-; CHECK-RV64VC-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: csrr a4, vlenb
+; CHECK-RV64VC-NEXT: slli a4, a4, 3
+; CHECK-RV64VC-NEXT: add a4, a4, sp
+; CHECK-RV64VC-NEXT: addi a4, a4, 16
+; CHECK-RV64VC-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT: sub a3, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: sub a1, a2, s0
-; CHECK-RV64VC-NEXT: sub a3, a0, a7
-; CHECK-RV64VC-NEXT: sltu a2, a2, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a3
+; CHECK-RV64VC-NEXT: sltu a2, s0, a2
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a1, a1, a2
@@ -39609,7 +39609,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64VC-NEXT: sub a1, a0, s0
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -39678,13 +39678,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32VC-NEXT: sub a0, a4, a1
-; CHECK-RV32VC-NEXT: sub a2, a3, a2
-; CHECK-RV32VC-NEXT: sltu a4, a4, a0
-; CHECK-RV32VC-NEXT: sltu a3, a3, a2
+; CHECK-RV32VC-NEXT: sltu a4, a1, a4
+; CHECK-RV32VC-NEXT: sub a5, a3, a2
+; CHECK-RV32VC-NEXT: sltu a2, a2, a3
; CHECK-RV32VC-NEXT: addi a4, a4, -1
-; CHECK-RV32VC-NEXT: addi a3, a3, -1
+; CHECK-RV32VC-NEXT: addi a2, a2, -1
; CHECK-RV32VC-NEXT: and a4, a4, a0
-; CHECK-RV32VC-NEXT: and a0, a3, a2
+; CHECK-RV32VC-NEXT: and a0, a2, a5
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v0
@@ -39696,10 +39696,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_ALL(<vscale x 64 x i8> %val, <v
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT: sub a1, a0, a1
-; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a0, a1
+; CHECK-RV32VC-NEXT: sltu a0, a1, a0
; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -39841,9 +39841,9 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64V-NEXT: sub a0, a4, s0
+; CHECK-RV64V-NEXT: sltu a4, s0, a4
; CHECK-RV64V-NEXT: sub a5, a3, a1
-; CHECK-RV64V-NEXT: sltu a4, a4, a0
-; CHECK-RV64V-NEXT: sltu a3, a3, a5
+; CHECK-RV64V-NEXT: sltu a3, a1, a3
; CHECK-RV64V-NEXT: addi a4, a4, -1
; CHECK-RV64V-NEXT: addi a3, a3, -1
; CHECK-RV64V-NEXT: and a4, a4, a0
@@ -39859,17 +39859,17 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64V-NEXT: sub a3, a0, s0
-; CHECK-RV64V-NEXT: sub a2, s1, a2
-; CHECK-RV64V-NEXT: sltu a0, a0, a3
-; CHECK-RV64V-NEXT: sltu a4, s1, a2
+; CHECK-RV64V-NEXT: sub a3, s1, a2
+; CHECK-RV64V-NEXT: sltu a2, a2, s1
+; CHECK-RV64V-NEXT: sub a4, a0, s0
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: addi a4, a4, -1
-; CHECK-RV64V-NEXT: and a3, a0, a3
-; CHECK-RV64V-NEXT: and a0, a4, a2
+; CHECK-RV64V-NEXT: addi a2, a2, -1
+; CHECK-RV64V-NEXT: and a4, a0, a4
+; CHECK-RV64V-NEXT: and a0, a2, a3
; CHECK-RV64V-NEXT: addi a2, sp, 16
; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64V-NEXT: mv a2, a0
@@ -39890,23 +39890,23 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV64V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v28, (zero), v8
-; CHECK-RV64V-NEXT: sub a3, a2, s0
-; CHECK-RV64V-NEXT: sub a1, a0, a1
-; CHECK-RV64V-NEXT: sltu a2, a2, a3
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sub a3, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, a1, a0
+; CHECK-RV64V-NEXT: sub a1, a2, s0
+; CHECK-RV64V-NEXT: sltu a2, s0, a2
; CHECK-RV64V-NEXT: addi a2, a2, -1
; CHECK-RV64V-NEXT: addi a0, a0, -1
-; CHECK-RV64V-NEXT: and a2, a2, a3
-; CHECK-RV64V-NEXT: and a0, a0, a1
-; CHECK-RV64V-NEXT: csrr a1, vlenb
-; CHECK-RV64V-NEXT: slli a1, a1, 3
-; CHECK-RV64V-NEXT: mv a3, a1
-; CHECK-RV64V-NEXT: slli a1, a1, 1
-; CHECK-RV64V-NEXT: add a1, a1, a3
-; CHECK-RV64V-NEXT: add a1, sp, a1
-; CHECK-RV64V-NEXT: addi a1, a1, 16
-; CHECK-RV64V-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64V-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: and a1, a2, a1
+; CHECK-RV64V-NEXT: and a0, a0, a3
+; CHECK-RV64V-NEXT: csrr a2, vlenb
+; CHECK-RV64V-NEXT: slli a2, a2, 3
+; CHECK-RV64V-NEXT: mv a3, a2
+; CHECK-RV64V-NEXT: slli a2, a2, 1
+; CHECK-RV64V-NEXT: add a2, a2, a3
+; CHECK-RV64V-NEXT: add a2, sp, a2
+; CHECK-RV64V-NEXT: addi a2, a2, 16
+; CHECK-RV64V-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64V-NEXT: vsetvli zero, a1, e8, m1, ta, ma
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v29, (zero), v8
; CHECK-RV64V-NEXT: mv a1, a0
@@ -39923,7 +39923,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV64V-NEXT: ntl.all
; CHECK-RV64V-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64V-NEXT: sub a1, a0, s0
-; CHECK-RV64V-NEXT: sltu a0, a0, a1
+; CHECK-RV64V-NEXT: sltu a0, s0, a0
; CHECK-RV64V-NEXT: addi a0, a0, -1
; CHECK-RV64V-NEXT: and a0, a0, a1
; CHECK-RV64V-NEXT: csrr a1, vlenb
@@ -39992,13 +39992,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32V-NEXT: sub a0, a4, a1
-; CHECK-RV32V-NEXT: sub a2, a3, a2
-; CHECK-RV32V-NEXT: sltu a4, a4, a0
-; CHECK-RV32V-NEXT: sltu a3, a3, a2
+; CHECK-RV32V-NEXT: sltu a4, a1, a4
+; CHECK-RV32V-NEXT: sub a5, a3, a2
+; CHECK-RV32V-NEXT: sltu a2, a2, a3
; CHECK-RV32V-NEXT: addi a4, a4, -1
-; CHECK-RV32V-NEXT: addi a3, a3, -1
+; CHECK-RV32V-NEXT: addi a2, a2, -1
; CHECK-RV32V-NEXT: and a4, a4, a0
-; CHECK-RV32V-NEXT: and a0, a3, a2
+; CHECK-RV32V-NEXT: and a0, a2, a5
; CHECK-RV32V-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v10, (zero), v0
@@ -40010,10 +40010,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV32V-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32V-NEXT: ntl.all
; CHECK-RV32V-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32V-NEXT: sub a1, a0, a1
-; CHECK-RV32V-NEXT: sltu a0, a0, a1
+; CHECK-RV32V-NEXT: sub a2, a0, a1
+; CHECK-RV32V-NEXT: sltu a0, a1, a0
; CHECK-RV32V-NEXT: addi a0, a0, -1
-; CHECK-RV32V-NEXT: and a0, a0, a1
+; CHECK-RV32V-NEXT: and a0, a0, a2
; CHECK-RV32V-NEXT: addi a1, sp, 16
; CHECK-RV32V-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -40108,102 +40108,102 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV64VC-NEXT: mv a3, a6
; CHECK-RV64VC-NEXT: .LBB919_2:
; CHECK-RV64VC-NEXT: slli a5, s0, 4
-; CHECK-RV64VC-NEXT: slli a7, s0, 1
-; CHECK-RV64VC-NEXT: slli a2, s0, 3
+; CHECK-RV64VC-NEXT: slli a1, s0, 1
+; CHECK-RV64VC-NEXT: slli a7, s0, 3
; CHECK-RV64VC-NEXT: mv a4, a3
-; CHECK-RV64VC-NEXT: bltu a3, a7, .LBB919_4
+; CHECK-RV64VC-NEXT: bltu a3, a1, .LBB919_4
; CHECK-RV64VC-NEXT: # %bb.3:
-; CHECK-RV64VC-NEXT: mv a4, a7
+; CHECK-RV64VC-NEXT: mv a4, a1
; CHECK-RV64VC-NEXT: .LBB919_4:
; CHECK-RV64VC-NEXT: vl8re64.v v8, (s1)
-; CHECK-RV64VC-NEXT: add a1, s1, a0
+; CHECK-RV64VC-NEXT: add a2, s1, a0
; CHECK-RV64VC-NEXT: add a5, a5, s1
-; CHECK-RV64VC-NEXT: add a2, a2, s1
+; CHECK-RV64VC-NEXT: add a7, a7, s1
; CHECK-RV64VC-NEXT: mv a0, a4
; CHECK-RV64VC-NEXT: bltu a4, s0, .LBB919_6
; CHECK-RV64VC-NEXT: # %bb.5:
; CHECK-RV64VC-NEXT: mv a0, s0
; CHECK-RV64VC-NEXT: .LBB919_6:
-; CHECK-RV64VC-NEXT: vl8re64.v v16, (a1)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 3
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v16, (a2)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 3
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
; CHECK-RV64VC-NEXT: vl8re64.v v16, (a5)
-; CHECK-RV64VC-NEXT: addi a1, sp, 16
-; CHECK-RV64VC-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
-; CHECK-RV64VC-NEXT: vl8re64.v v0, (a2)
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: mv a2, a1
-; CHECK-RV64VC-NEXT: slli a1, a1, 1
-; CHECK-RV64VC-NEXT: add a1, a1, a2
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: csrr a1, vlenb
-; CHECK-RV64VC-NEXT: slli a1, a1, 4
-; CHECK-RV64VC-NEXT: add a1, a1, sp
-; CHECK-RV64VC-NEXT: addi a1, a1, 16
-; CHECK-RV64VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: addi a2, sp, 16
+; CHECK-RV64VC-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-RV64VC-NEXT: vl8re64.v v0, (a7)
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: mv a5, a2
+; CHECK-RV64VC-NEXT: slli a2, a2, 1
+; CHECK-RV64VC-NEXT: add a2, a2, a5
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: csrr a2, vlenb
+; CHECK-RV64VC-NEXT: slli a2, a2, 4
+; CHECK-RV64VC-NEXT: add a2, a2, sp
+; CHECK-RV64VC-NEXT: addi a2, a2, 16
+; CHECK-RV64VC-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; CHECK-RV64VC-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v24, (zero), v16
; CHECK-RV64VC-NEXT: sub a0, a4, s0
-; CHECK-RV64VC-NEXT: sub a1, a3, a7
-; CHECK-RV64VC-NEXT: sltu a2, a4, a0
-; CHECK-RV64VC-NEXT: sltu a3, a3, a1
+; CHECK-RV64VC-NEXT: sltu a2, s0, a4
+; CHECK-RV64VC-NEXT: sub a4, a3, a1
+; CHECK-RV64VC-NEXT: sltu a3, a1, a3
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
; CHECK-RV64VC-NEXT: and a2, a2, a0
-; CHECK-RV64VC-NEXT: and a0, a3, a1
+; CHECK-RV64VC-NEXT: and a0, a3, a4
; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v25, (zero), v8
-; CHECK-RV64VC-NEXT: mv a1, a0
+; CHECK-RV64VC-NEXT: mv a2, a0
; CHECK-RV64VC-NEXT: bltu a0, s0, .LBB919_8
; CHECK-RV64VC-NEXT: # %bb.7:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a2, s0
; CHECK-RV64VC-NEXT: .LBB919_8:
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a2, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v26, (zero), v0
-; CHECK-RV64VC-NEXT: sub a1, a0, s0
; CHECK-RV64VC-NEXT: sub a2, s2, a6
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
-; CHECK-RV64VC-NEXT: sltu a3, s2, a2
+; CHECK-RV64VC-NEXT: sltu a3, a6, s2
+; CHECK-RV64VC-NEXT: sub a4, a0, s0
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: addi a3, a3, -1
-; CHECK-RV64VC-NEXT: and a1, a1, a0
+; CHECK-RV64VC-NEXT: and a4, a4, a0
; CHECK-RV64VC-NEXT: and a0, a3, a2
; CHECK-RV64VC-NEXT: addi a2, sp, 16
; CHECK-RV64VC-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v27, (zero), v8
; CHECK-RV64VC-NEXT: mv a2, a0
-; CHECK-RV64VC-NEXT: bltu a0, a7, .LBB919_10
+; CHECK-RV64VC-NEXT: bltu a0, a1, .LBB919_10
; CHECK-RV64VC-NEXT: # %bb.9:
-; CHECK-RV64VC-NEXT: mv a2, a7
+; CHECK-RV64VC-NEXT: mv a2, a1
; CHECK-RV64VC-NEXT: .LBB919_10:
-; CHECK-RV64VC-NEXT: mv a1, a2
+; CHECK-RV64VC-NEXT: mv a3, a2
; CHECK-RV64VC-NEXT: bltu a2, s0, .LBB919_12
; CHECK-RV64VC-NEXT: # %bb.11:
-; CHECK-RV64VC-NEXT: mv a1, s0
+; CHECK-RV64VC-NEXT: mv a3, s0
; CHECK-RV64VC-NEXT: .LBB919_12:
-; CHECK-RV64VC-NEXT: csrr a3, vlenb
-; CHECK-RV64VC-NEXT: slli a3, a3, 3
-; CHECK-RV64VC-NEXT: add a3, a3, sp
-; CHECK-RV64VC-NEXT: addi a3, a3, 16
-; CHECK-RV64VC-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload
-; CHECK-RV64VC-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-RV64VC-NEXT: csrr a4, vlenb
+; CHECK-RV64VC-NEXT: slli a4, a4, 3
+; CHECK-RV64VC-NEXT: add a4, a4, sp
+; CHECK-RV64VC-NEXT: addi a4, a4, 16
+; CHECK-RV64VC-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
+; CHECK-RV64VC-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v28, (zero), v8
+; CHECK-RV64VC-NEXT: sub a3, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, a1, a0
; CHECK-RV64VC-NEXT: sub a1, a2, s0
-; CHECK-RV64VC-NEXT: sub a3, a0, a7
-; CHECK-RV64VC-NEXT: sltu a2, a2, a1
-; CHECK-RV64VC-NEXT: sltu a0, a0, a3
+; CHECK-RV64VC-NEXT: sltu a2, s0, a2
; CHECK-RV64VC-NEXT: addi a2, a2, -1
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a1, a1, a2
@@ -40233,7 +40233,7 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV64VC-NEXT: c.ntl.all
; CHECK-RV64VC-NEXT: vsoxei64.v v30, (zero), v8
; CHECK-RV64VC-NEXT: sub a1, a0, s0
-; CHECK-RV64VC-NEXT: sltu a0, a0, a1
+; CHECK-RV64VC-NEXT: sltu a0, s0, a0
; CHECK-RV64VC-NEXT: addi a0, a0, -1
; CHECK-RV64VC-NEXT: and a0, a0, a1
; CHECK-RV64VC-NEXT: csrr a1, vlenb
@@ -40302,13 +40302,13 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v8, (zero), v16
; CHECK-RV32VC-NEXT: sub a0, a4, a1
-; CHECK-RV32VC-NEXT: sub a2, a3, a2
-; CHECK-RV32VC-NEXT: sltu a4, a4, a0
-; CHECK-RV32VC-NEXT: sltu a3, a3, a2
+; CHECK-RV32VC-NEXT: sltu a4, a1, a4
+; CHECK-RV32VC-NEXT: sub a5, a3, a2
+; CHECK-RV32VC-NEXT: sltu a2, a2, a3
; CHECK-RV32VC-NEXT: addi a4, a4, -1
-; CHECK-RV32VC-NEXT: addi a3, a3, -1
+; CHECK-RV32VC-NEXT: addi a2, a2, -1
; CHECK-RV32VC-NEXT: and a4, a4, a0
-; CHECK-RV32VC-NEXT: and a0, a3, a2
+; CHECK-RV32VC-NEXT: and a0, a2, a5
; CHECK-RV32VC-NEXT: vsetvli zero, a4, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v10, (zero), v0
@@ -40320,10 +40320,10 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT(<vscale x 64 x i8> %val
; CHECK-RV32VC-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; CHECK-RV32VC-NEXT: c.ntl.all
; CHECK-RV32VC-NEXT: vsoxei32.v v12, (zero), v24
-; CHECK-RV32VC-NEXT: sub a1, a0, a1
-; CHECK-RV32VC-NEXT: sltu a0, a0, a1
+; CHECK-RV32VC-NEXT: sub a2, a0, a1
+; CHECK-RV32VC-NEXT: sltu a0, a1, a0
; CHECK-RV32VC-NEXT: addi a0, a0, -1
-; CHECK-RV32VC-NEXT: and a0, a0, a1
+; CHECK-RV32VC-NEXT: and a0, a0, a2
; CHECK-RV32VC-NEXT: addi a1, sp, 16
; CHECK-RV32VC-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; CHECK-RV32VC-NEXT: vsetvli zero, a0, e8, m2, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
index 380287dd555c9..1c95c753c8ed1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
@@ -263,13 +263,13 @@ define <vscale x 32 x bfloat> @vp_rint_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -321,14 +321,14 @@ define <vscale x 32 x bfloat> @vp_rint_nxv32bf16_unmasked(<vscale x 32 x bfloat>
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -786,13 +786,13 @@ define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -858,14 +858,14 @@ define <vscale x 32 x half> @vp_rint_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1751,7 +1751,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFH-NEXT: sub a2, a0, a1
; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT: sltu a3, a0, a2
+; RV32ZVFH-NEXT: sltu a3, a1, a0
; RV32ZVFH-NEXT: addi a3, a3, -1
; RV32ZVFH-NEXT: and a2, a3, a2
; RV32ZVFH-NEXT: vmv1r.v v0, v6
@@ -1793,7 +1793,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
@@ -1834,7 +1834,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFMIN-NEXT: sub a2, a0, a1
; RV32ZVFMIN-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFMIN-NEXT: sltu a3, a0, a2
+; RV32ZVFMIN-NEXT: sltu a3, a1, a0
; RV32ZVFMIN-NEXT: addi a3, a3, -1
; RV32ZVFMIN-NEXT: and a2, a3, a2
; RV32ZVFMIN-NEXT: vmv1r.v v0, v6
@@ -1876,7 +1876,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -1917,7 +1917,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFH-NEXT: sub a3, a0, a1
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT: sltu a2, a0, a3
+; RV32ZVFH-NEXT: sltu a2, a1, a0
; RV32ZVFH-NEXT: addi a2, a2, -1
; RV32ZVFH-NEXT: and a2, a2, a3
; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -1947,7 +1947,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -1976,7 +1976,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
; RV32ZVFMIN-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFMIN-NEXT: sub a3, a0, a1
; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFMIN-NEXT: sltu a2, a0, a3
+; RV32ZVFMIN-NEXT: sltu a2, a1, a0
; RV32ZVFMIN-NEXT: addi a2, a2, -1
; RV32ZVFMIN-NEXT: and a2, a2, a3
; RV32ZVFMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2006,7 +2006,7 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
index 37c036d38148a..605b07c81f45a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFH-NEXT: sub a2, a0, a1
; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT: sltu a3, a0, a2
+; RV32ZVFH-NEXT: sltu a3, a1, a0
; RV32ZVFH-NEXT: addi a3, a3, -1
; RV32ZVFH-NEXT: and a2, a3, a2
; RV32ZVFH-NEXT: vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFHMIN-NEXT: sub a2, a0, a1
; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT: sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT: sltu a3, a1, a0
; RV32ZVFHMIN-NEXT: addi a3, a3, -1
; RV32ZVFHMIN-NEXT: and a2, a3, a2
; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFH-NEXT: sub a3, a0, a1
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT: sltu a2, a0, a3
+; RV32ZVFH-NEXT: sltu a2, a1, a0
; RV32ZVFH-NEXT: addi a2, a2, -1
; RV32ZVFH-NEXT: and a2, a2, a3
; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFHMIN-NEXT: sub a3, a0, a1
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT: sltu a2, a1, a0
; RV32ZVFHMIN-NEXT: addi a2, a2, -1
; RV32ZVFHMIN-NEXT: and a2, a2, a3
; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
index 37a9ec1c0a8aa..6869bc2050698 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16(<vscale x 32 x bfloat> %va
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16_unmasked(<vscale x 32 x bf
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFH-NEXT: sub a2, a0, a1
; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT: sltu a3, a0, a2
+; RV32ZVFH-NEXT: sltu a3, a1, a0
; RV32ZVFH-NEXT: addi a3, a3, -1
; RV32ZVFH-NEXT: and a2, a3, a2
; RV32ZVFH-NEXT: vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFHMIN-NEXT: sub a2, a0, a1
; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT: sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT: sltu a3, a1, a0
; RV32ZVFHMIN-NEXT: addi a3, a3, -1
; RV32ZVFHMIN-NEXT: and a2, a3, a2
; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFH-NEXT: sub a3, a0, a1
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT: sltu a2, a0, a3
+; RV32ZVFH-NEXT: sltu a2, a1, a0
; RV32ZVFH-NEXT: addi a2, a2, -1
; RV32ZVFH-NEXT: and a2, a2, a3
; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFHMIN-NEXT: sub a3, a0, a1
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT: sltu a2, a1, a0
; RV32ZVFHMIN-NEXT: addi a2, a2, -1
; RV32ZVFHMIN-NEXT: and a2, a2, a3
; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
index 5553b988fec97..8869a440c8634 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
@@ -283,13 +283,13 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16(<vscale x 32 x bfloat> %
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v6, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v5, v6
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v6
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -345,14 +345,14 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16_unmasked(<vscale x 32 x
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v7, v16, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vmv1r.v v6, v7
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v16, v24, v0.t
@@ -856,13 +856,13 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v5, v6
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -934,14 +934,14 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: fmv.w.x fa5, a3
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v7, v16, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vmv1r.v v6, v7
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
+; ZVFHMIN-NEXT: and a3, a4, a3
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t
@@ -1931,7 +1931,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFH-NEXT: sub a2, a0, a1
; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFH-NEXT: sltu a3, a0, a2
+; RV32ZVFH-NEXT: sltu a3, a1, a0
; RV32ZVFH-NEXT: addi a3, a3, -1
; RV32ZVFH-NEXT: and a2, a3, a2
; RV32ZVFH-NEXT: vmv1r.v v0, v6
@@ -1977,7 +1977,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vmv1r.v v0, v6
@@ -2022,7 +2022,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2)
; RV32ZVFHMIN-NEXT: sub a2, a0, a1
; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; RV32ZVFHMIN-NEXT: sltu a3, a0, a2
+; RV32ZVFHMIN-NEXT: sltu a3, a1, a0
; RV32ZVFHMIN-NEXT: addi a3, a3, -1
; RV32ZVFHMIN-NEXT: and a2, a3, a2
; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2068,7 +2068,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6
@@ -2113,7 +2113,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFH-NEXT: sub a3, a0, a1
; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFH-NEXT: sltu a2, a0, a3
+; RV32ZVFH-NEXT: sltu a2, a1, a0
; RV32ZVFH-NEXT: addi a2, a2, -1
; RV32ZVFH-NEXT: and a2, a2, a3
; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2147,7 +2147,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
; RV64ZVFH-NEXT: sub a3, a0, a1
; RV64ZVFH-NEXT: slli a2, a2, 52
; RV64ZVFH-NEXT: fmv.d.x fa5, a2
-; RV64ZVFH-NEXT: sltu a2, a0, a3
+; RV64ZVFH-NEXT: sltu a2, a1, a0
; RV64ZVFH-NEXT: addi a2, a2, -1
; RV64ZVFH-NEXT: and a2, a2, a3
; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2180,7 +2180,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0)
; RV32ZVFHMIN-NEXT: sub a3, a0, a1
; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2)
-; RV32ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV32ZVFHMIN-NEXT: sltu a2, a1, a0
; RV32ZVFHMIN-NEXT: addi a2, a2, -1
; RV32ZVFHMIN-NEXT: and a2, a2, a3
; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2214,7 +2214,7 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
; RV64ZVFHMIN-NEXT: sub a3, a0, a1
; RV64ZVFHMIN-NEXT: slli a2, a2, 52
; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2
-; RV64ZVFHMIN-NEXT: sltu a2, a0, a3
+; RV64ZVFHMIN-NEXT: sltu a2, a1, a0
; RV64ZVFHMIN-NEXT: addi a2, a2, -1
; RV64ZVFHMIN-NEXT: and a2, a2, a3
; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 634e58198def3..b67ab5c3c9efa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1337,211 +1337,404 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, b
}
define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vscale x 64 x bfloat> %vb, <vscale x 64 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: fcmp_oeq_vv_nxv64bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: mv a3, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: mv a3, a1
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv8r.v v0, v16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: mv a3, a1
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, a1, a3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a1, a3, 3
-; CHECK-NEXT: slli a5, a3, 2
-; CHECK-NEXT: slli a4, a3, 1
-; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: sub a6, a2, a5
-; CHECK-NEXT: vl8re16.v v24, (a1)
-; CHECK-NEXT: sltu a1, a2, a6
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a6, a1, a6
-; CHECK-NEXT: sub a1, a6, a4
-; CHECK-NEXT: sltu a7, a6, a1
-; CHECK-NEXT: addi a7, a7, -1
-; CHECK-NEXT: and a7, a7, a1
-; CHECK-NEXT: srli a1, a3, 1
-; CHECK-NEXT: srli a3, a3, 2
-; CHECK-NEXT: csrr t0, vlenb
-; CHECK-NEXT: slli t0, t0, 1
-; CHECK-NEXT: mv t1, t0
-; CHECK-NEXT: slli t0, t0, 2
-; CHECK-NEXT: add t1, t1, t0
-; CHECK-NEXT: slli t0, t0, 1
-; CHECK-NEXT: add t0, t0, t1
-; CHECK-NEXT: add t0, sp, t0
-; CHECK-NEXT: addi t0, t0, 16
-; CHECK-NEXT: vl1r.v v8, (t0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT: vslidedown.vx v16, v8, a1
-; CHECK-NEXT: vl8re16.v v8, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: mv t0, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a0, a0, t0
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v8, v16, a3
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a7, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4
-; CHECK-NEXT: bltu a6, a4, .LBB85_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a6, a4
-; CHECK-NEXT: .LBB85_2:
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v5, v8, v16, v0.t
-; CHECK-NEXT: vsetvli zero, a6, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v7, v24, v16, v0.t
-; CHECK-NEXT: bltu a2, a5, .LBB85_4
-; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: mv a2, a5
-; CHECK-NEXT: .LBB85_4:
-; CHECK-NEXT: sub a0, a2, a4
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a5, a5, 1
-; CHECK-NEXT: mv a6, a5
-; CHECK-NEXT: slli a5, a5, 2
-; CHECK-NEXT: add a6, a6, a5
-; CHECK-NEXT: slli a5, a5, 1
-; CHECK-NEXT: add a5, a5, a6
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vl1r.v v8, (a5) # vscale x 8-byte Folded Reload
-; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v8, a3
-; CHECK-NEXT: sltu a5, a2, a0
-; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a0, a5, a0
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a5, a5, 1
-; CHECK-NEXT: mv a6, a5
-; CHECK-NEXT: slli a5, a5, 3
-; CHECK-NEXT: add a5, a5, a6
-; CHECK-NEXT: add a5, sp, a5
-; CHECK-NEXT: addi a5, a5, 16
-; CHECK-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: mv a5, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a0, a0, a5
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v10, v16, v24, v0.t
-; CHECK-NEXT: vmv1r.v v9, v7
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v9, v5, a3
-; CHECK-NEXT: bltu a2, a4, .LBB85_6
-; CHECK-NEXT: # %bb.5:
-; CHECK-NEXT: mv a2, a4
-; CHECK-NEXT: .LBB85_6:
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: mv a4, a0
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, a0, a4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: mv a2, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: mv a2, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a2, a2, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmfeq.vv v8, v16, v24, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v10, a3
-; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v9, a1
-; CHECK-NEXT: vmv.v.v v0, v8
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a1, a1, a0
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add a1, a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; CHECK32-LABEL: fcmp_oeq_vv_nxv64bf16:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: addi sp, sp, -16
+; CHECK32-NEXT: .cfi_def_cfa_offset 16
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: mv a3, a1
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: add a3, a3, a1
+; CHECK32-NEXT: slli a1, a1, 2
+; CHECK32-NEXT: add a3, a3, a1
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: add a1, a1, a3
+; CHECK32-NEXT: sub sp, sp, a1
+; CHECK32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: mv a3, a1
+; CHECK32-NEXT: slli a1, a1, 2
+; CHECK32-NEXT: add a3, a3, a1
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: add a1, a1, a3
+; CHECK32-NEXT: add a1, sp, a1
+; CHECK32-NEXT: addi a1, a1, 16
+; CHECK32-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK32-NEXT: vmv8r.v v0, v16
+; CHECK32-NEXT: csrr a1, vlenb
+; CHECK32-NEXT: slli a1, a1, 1
+; CHECK32-NEXT: mv a3, a1
+; CHECK32-NEXT: slli a1, a1, 3
+; CHECK32-NEXT: add a1, a1, a3
+; CHECK32-NEXT: add a1, sp, a1
+; CHECK32-NEXT: addi a1, a1, 16
+; CHECK32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT: csrr a3, vlenb
+; CHECK32-NEXT: srli a1, a3, 1
+; CHECK32-NEXT: slli a4, a3, 3
+; CHECK32-NEXT: slli a6, a3, 2
+; CHECK32-NEXT: slli a5, a3, 1
+; CHECK32-NEXT: add a4, a0, a4
+; CHECK32-NEXT: sub a7, a2, a6
+; CHECK32-NEXT: sltu t0, a6, a2
+; CHECK32-NEXT: vl8re16.v v24, (a4)
+; CHECK32-NEXT: addi t0, t0, -1
+; CHECK32-NEXT: and a7, t0, a7
+; CHECK32-NEXT: sub a4, a7, a5
+; CHECK32-NEXT: sltu t0, a5, a7
+; CHECK32-NEXT: addi t0, t0, -1
+; CHECK32-NEXT: and t0, t0, a4
+; CHECK32-NEXT: srli a4, a3, 2
+; CHECK32-NEXT: csrr t1, vlenb
+; CHECK32-NEXT: slli t1, t1, 1
+; CHECK32-NEXT: mv t2, t1
+; CHECK32-NEXT: slli t1, t1, 2
+; CHECK32-NEXT: add t2, t2, t1
+; CHECK32-NEXT: slli t1, t1, 1
+; CHECK32-NEXT: add t1, t1, t2
+; CHECK32-NEXT: add t1, sp, t1
+; CHECK32-NEXT: addi t1, t1, 16
+; CHECK32-NEXT: vl1r.v v8, (t1) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: vslidedown.vx v16, v8, a1
+; CHECK32-NEXT: vl8re16.v v8, (a0)
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: mv t1, a0
+; CHECK32-NEXT: slli a0, a0, 2
+; CHECK32-NEXT: add a0, a0, t1
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK32-NEXT: vslidedown.vx v8, v16, a4
+; CHECK32-NEXT: addi a0, sp, 16
+; CHECK32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK32-NEXT: vsetvli zero, t0, e16, m4, ta, ma
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v8, v4
+; CHECK32-NEXT: bltu a7, a5, .LBB85_2
+; CHECK32-NEXT: # %bb.1:
+; CHECK32-NEXT: mv a7, a5
+; CHECK32-NEXT: .LBB85_2:
+; CHECK32-NEXT: addi a0, sp, 16
+; CHECK32-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT: vmfeq.vv v5, v8, v16, v0.t
+; CHECK32-NEXT: vsetvli zero, a7, e16, m4, ta, ma
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT: vmfeq.vv v7, v24, v16, v0.t
+; CHECK32-NEXT: bltu a2, a6, .LBB85_4
+; CHECK32-NEXT: # %bb.3:
+; CHECK32-NEXT: mv a2, a6
+; CHECK32-NEXT: .LBB85_4:
+; CHECK32-NEXT: sub a0, a2, a5
+; CHECK32-NEXT: sltu a6, a5, a2
+; CHECK32-NEXT: csrr a7, vlenb
+; CHECK32-NEXT: slli a7, a7, 1
+; CHECK32-NEXT: mv t0, a7
+; CHECK32-NEXT: slli a7, a7, 2
+; CHECK32-NEXT: add t0, t0, a7
+; CHECK32-NEXT: slli a7, a7, 1
+; CHECK32-NEXT: add a7, a7, t0
+; CHECK32-NEXT: add a7, sp, a7
+; CHECK32-NEXT: addi a7, a7, 16
+; CHECK32-NEXT: vl1r.v v8, (a7) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: vsetvli a7, zero, e8, mf2, ta, ma
+; CHECK32-NEXT: vslidedown.vx v0, v8, a4
+; CHECK32-NEXT: addi a6, a6, -1
+; CHECK32-NEXT: and a0, a6, a0
+; CHECK32-NEXT: csrr a6, vlenb
+; CHECK32-NEXT: slli a6, a6, 1
+; CHECK32-NEXT: mv a7, a6
+; CHECK32-NEXT: slli a6, a6, 3
+; CHECK32-NEXT: add a6, a6, a7
+; CHECK32-NEXT: add a6, sp, a6
+; CHECK32-NEXT: addi a6, a6, 16
+; CHECK32-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: mv a6, a0
+; CHECK32-NEXT: slli a0, a0, 2
+; CHECK32-NEXT: add a0, a0, a6
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK32-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT: vmfeq.vv v10, v16, v24, v0.t
+; CHECK32-NEXT: vmv1r.v v9, v7
+; CHECK32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK32-NEXT: vslideup.vx v9, v5, a4
+; CHECK32-NEXT: bltu a2, a5, .LBB85_6
+; CHECK32-NEXT: # %bb.5:
+; CHECK32-NEXT: mv a2, a5
+; CHECK32-NEXT: .LBB85_6:
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: mv a5, a0
+; CHECK32-NEXT: slli a0, a0, 3
+; CHECK32-NEXT: add a0, a0, a5
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: mv a2, a0
+; CHECK32-NEXT: slli a0, a0, 2
+; CHECK32-NEXT: add a0, a0, a2
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
+; CHECK32-NEXT: vfwcvtbf16.f.f.v v24, v0
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: mv a2, a0
+; CHECK32-NEXT: slli a0, a0, 2
+; CHECK32-NEXT: add a2, a2, a0
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: add a0, a0, a2
+; CHECK32-NEXT: add a0, sp, a0
+; CHECK32-NEXT: addi a0, a0, 16
+; CHECK32-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK32-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK32-NEXT: vmfeq.vv v8, v16, v24, v0.t
+; CHECK32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK32-NEXT: vslideup.vx v8, v10, a4
+; CHECK32-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; CHECK32-NEXT: vslideup.vx v8, v9, a1
+; CHECK32-NEXT: vmv.v.v v0, v8
+; CHECK32-NEXT: csrr a0, vlenb
+; CHECK32-NEXT: mv a1, a0
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: add a1, a1, a0
+; CHECK32-NEXT: slli a0, a0, 2
+; CHECK32-NEXT: add a1, a1, a0
+; CHECK32-NEXT: slli a0, a0, 1
+; CHECK32-NEXT: add a0, a0, a1
+; CHECK32-NEXT: add sp, sp, a0
+; CHECK32-NEXT: .cfi_def_cfa sp, 16
+; CHECK32-NEXT: addi sp, sp, 16
+; CHECK32-NEXT: .cfi_def_cfa_offset 0
+; CHECK32-NEXT: ret
+;
+; CHECK64-LABEL: fcmp_oeq_vv_nxv64bf16:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: addi sp, sp, -16
+; CHECK64-NEXT: .cfi_def_cfa_offset 16
+; CHECK64-NEXT: csrr a1, vlenb
+; CHECK64-NEXT: mv a3, a1
+; CHECK64-NEXT: slli a1, a1, 1
+; CHECK64-NEXT: add a3, a3, a1
+; CHECK64-NEXT: slli a1, a1, 2
+; CHECK64-NEXT: add a3, a3, a1
+; CHECK64-NEXT: slli a1, a1, 1
+; CHECK64-NEXT: add a1, a1, a3
+; CHECK64-NEXT: sub sp, sp, a1
+; CHECK64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
+; CHECK64-NEXT: csrr a1, vlenb
+; CHECK64-NEXT: slli a1, a1, 1
+; CHECK64-NEXT: mv a3, a1
+; CHECK64-NEXT: slli a1, a1, 3
+; CHECK64-NEXT: add a1, a1, a3
+; CHECK64-NEXT: add a1, sp, a1
+; CHECK64-NEXT: addi a1, a1, 16
+; CHECK64-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK64-NEXT: vmv8r.v v0, v16
+; CHECK64-NEXT: csrr a1, vlenb
+; CHECK64-NEXT: mv a3, a1
+; CHECK64-NEXT: slli a1, a1, 1
+; CHECK64-NEXT: add a3, a3, a1
+; CHECK64-NEXT: slli a1, a1, 3
+; CHECK64-NEXT: add a1, a1, a3
+; CHECK64-NEXT: add a1, sp, a1
+; CHECK64-NEXT: addi a1, a1, 16
+; CHECK64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT: csrr a3, vlenb
+; CHECK64-NEXT: slli a1, a3, 3
+; CHECK64-NEXT: slli a5, a3, 2
+; CHECK64-NEXT: slli a4, a3, 1
+; CHECK64-NEXT: add a1, a0, a1
+; CHECK64-NEXT: sub a6, a2, a5
+; CHECK64-NEXT: sltu a7, a5, a2
+; CHECK64-NEXT: vl8re16.v v24, (a1)
+; CHECK64-NEXT: addi a7, a7, -1
+; CHECK64-NEXT: and a6, a7, a6
+; CHECK64-NEXT: sub a1, a6, a4
+; CHECK64-NEXT: sltu a7, a4, a6
+; CHECK64-NEXT: addi a7, a7, -1
+; CHECK64-NEXT: and a7, a7, a1
+; CHECK64-NEXT: srli a1, a3, 1
+; CHECK64-NEXT: srli a3, a3, 2
+; CHECK64-NEXT: csrr t0, vlenb
+; CHECK64-NEXT: slli t0, t0, 1
+; CHECK64-NEXT: mv t1, t0
+; CHECK64-NEXT: slli t0, t0, 3
+; CHECK64-NEXT: add t0, t0, t1
+; CHECK64-NEXT: add t0, sp, t0
+; CHECK64-NEXT: addi t0, t0, 16
+; CHECK64-NEXT: vl1r.v v8, (t0) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT: vslidedown.vx v16, v8, a1
+; CHECK64-NEXT: vl8re16.v v8, (a0)
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: mv t0, a0
+; CHECK64-NEXT: slli a0, a0, 2
+; CHECK64-NEXT: add a0, a0, t0
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK64-NEXT: vslidedown.vx v8, v16, a3
+; CHECK64-NEXT: addi a0, sp, 16
+; CHECK64-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; CHECK64-NEXT: vsetvli zero, a7, e16, m4, ta, ma
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v8, v4
+; CHECK64-NEXT: bltu a6, a4, .LBB85_2
+; CHECK64-NEXT: # %bb.1:
+; CHECK64-NEXT: mv a6, a4
+; CHECK64-NEXT: .LBB85_2:
+; CHECK64-NEXT: addi a0, sp, 16
+; CHECK64-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT: vmfeq.vv v5, v8, v16, v0.t
+; CHECK64-NEXT: vsetvli zero, a6, e16, m4, ta, ma
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT: vmfeq.vv v6, v24, v16, v0.t
+; CHECK64-NEXT: bltu a2, a5, .LBB85_4
+; CHECK64-NEXT: # %bb.3:
+; CHECK64-NEXT: mv a2, a5
+; CHECK64-NEXT: .LBB85_4:
+; CHECK64-NEXT: sub a0, a2, a4
+; CHECK64-NEXT: sltu a5, a4, a2
+; CHECK64-NEXT: csrr a6, vlenb
+; CHECK64-NEXT: slli a6, a6, 1
+; CHECK64-NEXT: mv a7, a6
+; CHECK64-NEXT: slli a6, a6, 3
+; CHECK64-NEXT: add a6, a6, a7
+; CHECK64-NEXT: add a6, sp, a6
+; CHECK64-NEXT: addi a6, a6, 16
+; CHECK64-NEXT: vl1r.v v7, (a6) # vscale x 8-byte Folded Reload
+; CHECK64-NEXT: vsetvli a6, zero, e8, mf2, ta, ma
+; CHECK64-NEXT: vslidedown.vx v0, v7, a3
+; CHECK64-NEXT: addi a5, a5, -1
+; CHECK64-NEXT: and a0, a5, a0
+; CHECK64-NEXT: csrr a5, vlenb
+; CHECK64-NEXT: mv a6, a5
+; CHECK64-NEXT: slli a5, a5, 1
+; CHECK64-NEXT: add a6, a6, a5
+; CHECK64-NEXT: slli a5, a5, 3
+; CHECK64-NEXT: add a5, a5, a6
+; CHECK64-NEXT: add a5, sp, a5
+; CHECK64-NEXT: addi a5, a5, 16
+; CHECK64-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v16, v28
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: mv a5, a0
+; CHECK64-NEXT: slli a0, a0, 2
+; CHECK64-NEXT: add a0, a0, a5
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK64-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT: vmfeq.vv v4, v16, v24, v0.t
+; CHECK64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK64-NEXT: vslideup.vx v6, v5, a3
+; CHECK64-NEXT: bltu a2, a4, .LBB85_6
+; CHECK64-NEXT: # %bb.5:
+; CHECK64-NEXT: mv a2, a4
+; CHECK64-NEXT: .LBB85_6:
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: mv a4, a0
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: add a4, a4, a0
+; CHECK64-NEXT: slli a0, a0, 3
+; CHECK64-NEXT: add a0, a0, a4
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v16, v24
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: mv a2, a0
+; CHECK64-NEXT: slli a0, a0, 2
+; CHECK64-NEXT: add a0, a0, a2
+; CHECK64-NEXT: add a0, sp, a0
+; CHECK64-NEXT: addi a0, a0, 16
+; CHECK64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; CHECK64-NEXT: vfwcvtbf16.f.f.v v24, v8
+; CHECK64-NEXT: vmv1r.v v0, v7
+; CHECK64-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK64-NEXT: vmfeq.vv v8, v16, v24, v0.t
+; CHECK64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK64-NEXT: vslideup.vx v8, v4, a3
+; CHECK64-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK64-NEXT: vslideup.vx v8, v6, a1
+; CHECK64-NEXT: vmv.v.v v0, v8
+; CHECK64-NEXT: csrr a0, vlenb
+; CHECK64-NEXT: mv a1, a0
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: add a1, a1, a0
+; CHECK64-NEXT: slli a0, a0, 2
+; CHECK64-NEXT: add a1, a1, a0
+; CHECK64-NEXT: slli a0, a0, 1
+; CHECK64-NEXT: add a0, a0, a1
+; CHECK64-NEXT: add sp, sp, a0
+; CHECK64-NEXT: .cfi_def_cfa sp, 16
+; CHECK64-NEXT: addi sp, sp, 16
+; CHECK64-NEXT: .cfi_def_cfa_offset 0
+; CHECK64-NEXT: ret
%v = call <vscale x 64 x i1> @llvm.vp.fcmp.nxv64bf16(<vscale x 64 x bfloat> %va, <vscale x 64 x bfloat> %vb, metadata !"oeq", <vscale x 64 x i1> %m, i32 %evl)
ret <vscale x 64 x i1> %v
}
@@ -3479,257 +3672,6 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
}
define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscale x 64 x half> %vb, <vscale x 64 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: fcmp_oeq_vv_nxv64f16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: addi sp, sp, -16
-; ZVFH-NEXT: .cfi_def_cfa_offset 16
-; ZVFH-NEXT: csrr a1, vlenb
-; ZVFH-NEXT: slli a1, a1, 3
-; ZVFH-NEXT: sub sp, sp, a1
-; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFH-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; ZVFH-NEXT: vmv1r.v v7, v0
-; ZVFH-NEXT: addi a1, sp, 16
-; ZVFH-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: csrr a3, vlenb
-; ZVFH-NEXT: srli a1, a3, 1
-; ZVFH-NEXT: slli a4, a3, 3
-; ZVFH-NEXT: slli a3, a3, 2
-; ZVFH-NEXT: add a4, a0, a4
-; ZVFH-NEXT: sub a5, a2, a3
-; ZVFH-NEXT: vl8re16.v v24, (a4)
-; ZVFH-NEXT: sltu a4, a2, a5
-; ZVFH-NEXT: addi a4, a4, -1
-; ZVFH-NEXT: vl8re16.v v8, (a0)
-; ZVFH-NEXT: vslidedown.vx v0, v0, a1
-; ZVFH-NEXT: and a4, a4, a5
-; ZVFH-NEXT: vsetvli zero, a4, e16, m8, ta, ma
-; ZVFH-NEXT: vmfeq.vv v6, v16, v24, v0.t
-; ZVFH-NEXT: bltu a2, a3, .LBB171_2
-; ZVFH-NEXT: # %bb.1:
-; ZVFH-NEXT: mv a2, a3
-; ZVFH-NEXT: .LBB171_2:
-; ZVFH-NEXT: vmv1r.v v0, v7
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; ZVFH-NEXT: vmfeq.vv v16, v24, v8, v0.t
-; ZVFH-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; ZVFH-NEXT: vslideup.vx v16, v6, a1
-; ZVFH-NEXT: vmv.v.v v0, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add sp, sp, a0
-; ZVFH-NEXT: .cfi_def_cfa sp, 16
-; ZVFH-NEXT: addi sp, sp, 16
-; ZVFH-NEXT: .cfi_def_cfa_offset 0
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv64f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: mv a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a3, a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 2
-; ZVFHMIN-NEXT: add a3, a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: mv a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 2
-; ZVFHMIN-NEXT: add a3, a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a3
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; ZVFHMIN-NEXT: vmv8r.v v0, v16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: mv a3, a1
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: add a1, a1, a3
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a1, a3, 3
-; ZVFHMIN-NEXT: slli a5, a3, 2
-; ZVFHMIN-NEXT: slli a4, a3, 1
-; ZVFHMIN-NEXT: add a1, a0, a1
-; ZVFHMIN-NEXT: sub a6, a2, a5
-; ZVFHMIN-NEXT: vl8re16.v v24, (a1)
-; ZVFHMIN-NEXT: sltu a1, a2, a6
-; ZVFHMIN-NEXT: addi a1, a1, -1
-; ZVFHMIN-NEXT: and a6, a1, a6
-; ZVFHMIN-NEXT: sub a1, a6, a4
-; ZVFHMIN-NEXT: sltu a7, a6, a1
-; ZVFHMIN-NEXT: addi a7, a7, -1
-; ZVFHMIN-NEXT: and a7, a7, a1
-; ZVFHMIN-NEXT: srli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: csrr t0, vlenb
-; ZVFHMIN-NEXT: slli t0, t0, 1
-; ZVFHMIN-NEXT: mv t1, t0
-; ZVFHMIN-NEXT: slli t0, t0, 2
-; ZVFHMIN-NEXT: add t1, t1, t0
-; ZVFHMIN-NEXT: slli t0, t0, 1
-; ZVFHMIN-NEXT: add t0, t0, t1
-; ZVFHMIN-NEXT: add t0, sp, t0
-; ZVFHMIN-NEXT: addi t0, t0, 16
-; ZVFHMIN-NEXT: vl1r.v v8, (t0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT: vslidedown.vx v16, v8, a1
-; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: mv t0, a0
-; ZVFHMIN-NEXT: slli a0, a0, 2
-; ZVFHMIN-NEXT: add a0, a0, t0
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v8, v16, a3
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a7, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v4
-; ZVFHMIN-NEXT: bltu a6, a4, .LBB171_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a6, a4
-; ZVFHMIN-NEXT: .LBB171_2:
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v5, v8, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, a6, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v16, v0.t
-; ZVFHMIN-NEXT: bltu a2, a5, .LBB171_4
-; ZVFHMIN-NEXT: # %bb.3:
-; ZVFHMIN-NEXT: mv a2, a5
-; ZVFHMIN-NEXT: .LBB171_4:
-; ZVFHMIN-NEXT: sub a0, a2, a4
-; ZVFHMIN-NEXT: csrr a5, vlenb
-; ZVFHMIN-NEXT: slli a5, a5, 1
-; ZVFHMIN-NEXT: mv a6, a5
-; ZVFHMIN-NEXT: slli a5, a5, 2
-; ZVFHMIN-NEXT: add a6, a6, a5
-; ZVFHMIN-NEXT: slli a5, a5, 1
-; ZVFHMIN-NEXT: add a5, a5, a6
-; ZVFHMIN-NEXT: add a5, sp, a5
-; ZVFHMIN-NEXT: addi a5, a5, 16
-; ZVFHMIN-NEXT: vl1r.v v8, (a5) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a5, a2, a0
-; ZVFHMIN-NEXT: addi a5, a5, -1
-; ZVFHMIN-NEXT: and a0, a5, a0
-; ZVFHMIN-NEXT: csrr a5, vlenb
-; ZVFHMIN-NEXT: slli a5, a5, 1
-; ZVFHMIN-NEXT: mv a6, a5
-; ZVFHMIN-NEXT: slli a5, a5, 3
-; ZVFHMIN-NEXT: add a5, a5, a6
-; ZVFHMIN-NEXT: add a5, sp, a5
-; ZVFHMIN-NEXT: addi a5, a5, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: mv a5, a0
-; ZVFHMIN-NEXT: slli a0, a0, 2
-; ZVFHMIN-NEXT: add a0, a0, a5
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v10, v16, v24, v0.t
-; ZVFHMIN-NEXT: vmv1r.v v9, v7
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslideup.vx v9, v5, a3
-; ZVFHMIN-NEXT: bltu a2, a4, .LBB171_6
-; ZVFHMIN-NEXT: # %bb.5:
-; ZVFHMIN-NEXT: mv a2, a4
-; ZVFHMIN-NEXT: .LBB171_6:
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: mv a4, a0
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, a0, a4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: mv a2, a0
-; ZVFHMIN-NEXT: slli a0, a0, 2
-; ZVFHMIN-NEXT: add a0, a0, a2
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: mv a2, a0
-; ZVFHMIN-NEXT: slli a0, a0, 2
-; ZVFHMIN-NEXT: add a2, a2, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a2
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslideup.vx v8, v10, a3
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; ZVFHMIN-NEXT: vslideup.vx v8, v9, a1
-; ZVFHMIN-NEXT: vmv.v.v v0, v8
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a1, a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 2
-; ZVFHMIN-NEXT: add a1, a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%v = call <vscale x 64 x i1> @llvm.vp.fcmp.nxv64f16(<vscale x 64 x half> %va, <vscale x 64 x half> %vb, metadata !"oeq", <vscale x 64 x i1> %m, i32 %evl)
ret <vscale x 64 x i1> %v
}
@@ -4879,7 +4821,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK32-NEXT: add a4, s3, a6
; CHECK32-NEXT: vl8re64.v v24, (s3)
; CHECK32-NEXT: sub a6, a3, s0
-; CHECK32-NEXT: sltu a7, a3, a6
+; CHECK32-NEXT: sltu a7, s0, a3
; CHECK32-NEXT: addi a7, a7, -1
; CHECK32-NEXT: and a6, a7, a6
; CHECK32-NEXT: csrr a7, vlenb
@@ -4919,7 +4861,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
; CHECK32-NEXT: vl8re64.v v16, (a4)
; CHECK32-NEXT: sub a1, s1, a2
-; CHECK32-NEXT: sltu a2, s1, a1
+; CHECK32-NEXT: sltu a2, a2, s1
; CHECK32-NEXT: vl8re64.v v24, (s2)
; CHECK32-NEXT: addi a2, a2, -1
; CHECK32-NEXT: and s1, a2, a1
@@ -4964,7 +4906,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK32-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
; CHECK32-NEXT: vslideup.vx v9, v8, s4
; CHECK32-NEXT: sub a1, s1, s0
-; CHECK32-NEXT: sltu a2, s1, a1
+; CHECK32-NEXT: sltu a2, s0, s1
; CHECK32-NEXT: addi a2, a2, -1
; CHECK32-NEXT: and a1, a2, a1
; CHECK32-NEXT: csrr a2, vlenb
@@ -4979,7 +4921,8 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
; CHECK32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK32-NEXT: vmfeq.vv v8, v24, v16, v0.t
-; CHECK32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK32-NEXT: srli s0, s0, 1
+; CHECK32-NEXT: vsetvli zero, s0, e8, mf2, ta, ma
; CHECK32-NEXT: vslideup.vx v9, v8, a0
; CHECK32-NEXT: vmv1r.v v0, v9
; CHECK32-NEXT: csrr a0, vlenb
@@ -5090,7 +5033,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK64-NEXT: add a4, s3, a6
; CHECK64-NEXT: vl8re64.v v24, (s3)
; CHECK64-NEXT: sub a6, a3, s0
-; CHECK64-NEXT: sltu a7, a3, a6
+; CHECK64-NEXT: sltu a7, s0, a3
; CHECK64-NEXT: addi a7, a7, -1
; CHECK64-NEXT: and a6, a7, a6
; CHECK64-NEXT: csrr a7, vlenb
@@ -5130,7 +5073,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK64-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
; CHECK64-NEXT: vl8re64.v v16, (a4)
; CHECK64-NEXT: sub a1, s1, a2
-; CHECK64-NEXT: sltu a2, s1, a1
+; CHECK64-NEXT: sltu a2, a2, s1
; CHECK64-NEXT: vl8re64.v v24, (s2)
; CHECK64-NEXT: addi a2, a2, -1
; CHECK64-NEXT: and s1, a2, a1
@@ -5175,7 +5118,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK64-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
; CHECK64-NEXT: vslideup.vx v9, v8, s4
; CHECK64-NEXT: sub a1, s1, s0
-; CHECK64-NEXT: sltu a2, s1, a1
+; CHECK64-NEXT: sltu a2, s0, s1
; CHECK64-NEXT: addi a2, a2, -1
; CHECK64-NEXT: and a1, a2, a1
; CHECK64-NEXT: csrr a2, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index c1de57bf850ac..829a3b43bd984 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -1083,7 +1083,7 @@ define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
; CHECK-NEXT: add a2, a0, a1
; CHECK-NEXT: sub a4, a3, a1
; CHECK-NEXT: vl8r.v v24, (a2)
-; CHECK-NEXT: sltu a2, a3, a4
+; CHECK-NEXT: sltu a2, a1, a3
; CHECK-NEXT: vl8r.v v8, (a0)
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a4
@@ -1120,7 +1120,7 @@ define <vscale x 128 x i1> @icmp_eq_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -1150,7 +1150,7 @@ define <vscale x 128 x i1> @icmp_eq_vx_swap_nxv128i8(<vscale x 128 x i8> %va, i8
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -2195,81 +2195,155 @@ define <vscale x 8 x i1> @icmp_sle_vi_swap_nxv8i32(<vscale x 8 x i32> %va, <vsca
}
define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: icmp_eq_vv_nxv32i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: srli a1, a3, 2
-; CHECK-NEXT: slli a4, a3, 3
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: add a4, a0, a4
-; CHECK-NEXT: sub a5, a2, a3
-; CHECK-NEXT: vl8re32.v v24, (a4)
-; CHECK-NEXT: sltu a4, a2, a5
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: vl8re32.v v8, (a0)
-; CHECK-NEXT: vslidedown.vx v0, v0, a1
-; CHECK-NEXT: and a4, a4, a5
-; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vv v6, v16, v24, v0.t
-; CHECK-NEXT: bltu a2, a3, .LBB189_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a2, a3
-; CHECK-NEXT: .LBB189_2:
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vv v16, v24, v8, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v16, v6, a1
-; CHECK-NEXT: vmv1r.v v0, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
+; RV32-LABEL: icmp_eq_vv_nxv32i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v7, v0
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: srli a1, a3, 2
+; RV32-NEXT: slli a5, a3, 3
+; RV32-NEXT: slli a4, a3, 1
+; RV32-NEXT: add a5, a0, a5
+; RV32-NEXT: sub a6, a2, a4
+; RV32-NEXT: vl8re32.v v24, (a5)
+; RV32-NEXT: sltu a5, a4, a2
+; RV32-NEXT: addi a5, a5, -1
+; RV32-NEXT: vl8re32.v v8, (a0)
+; RV32-NEXT: vslidedown.vx v0, v0, a1
+; RV32-NEXT: and a0, a5, a6
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vmseq.vv v6, v16, v24, v0.t
+; RV32-NEXT: bltu a2, a4, .LBB189_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a2, a4
+; RV32-NEXT: .LBB189_2:
+; RV32-NEXT: vmv1r.v v0, v7
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmseq.vv v16, v24, v8, v0.t
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v16, v6, a1
+; RV32-NEXT: vmv1r.v v0, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: icmp_eq_vv_nxv32i32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v7, v0
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: srli a1, a3, 2
+; RV64-NEXT: slli a4, a3, 3
+; RV64-NEXT: slli a3, a3, 1
+; RV64-NEXT: add a4, a0, a4
+; RV64-NEXT: sub a5, a2, a3
+; RV64-NEXT: vl8re32.v v24, (a4)
+; RV64-NEXT: sltu a4, a3, a2
+; RV64-NEXT: addi a4, a4, -1
+; RV64-NEXT: vl8re32.v v8, (a0)
+; RV64-NEXT: vslidedown.vx v0, v0, a1
+; RV64-NEXT: and a4, a4, a5
+; RV64-NEXT: vsetvli zero, a4, e32, m8, ta, ma
+; RV64-NEXT: vmseq.vv v6, v16, v24, v0.t
+; RV64-NEXT: bltu a2, a3, .LBB189_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a2, a3
+; RV64-NEXT: .LBB189_2:
+; RV64-NEXT: vmv1r.v v0, v7
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV64-NEXT: vmseq.vv v16, v24, v8, v0.t
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v16, v6, a1
+; RV64-NEXT: vmv1r.v v0, v16
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 16
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
%v = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, metadata !"eq", <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x i1> %v
}
define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: icmp_eq_vx_nxv32i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vmv1r.v v24, v0
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: srli a2, a3, 2
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sub a4, a1, a3
-; CHECK-NEXT: sltu a5, a1, a4
-; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a4, a5, a4
-; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t
-; CHECK-NEXT: bltu a1, a3, .LBB190_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a1, a3
-; CHECK-NEXT: .LBB190_2:
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v16, v25, a2
-; CHECK-NEXT: vmv1r.v v0, v16
-; CHECK-NEXT: ret
+; RV32-LABEL: icmp_eq_vx_nxv32i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v24, v0
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: srli a2, a3, 2
+; RV32-NEXT: slli a4, a3, 1
+; RV32-NEXT: vslidedown.vx v0, v0, a2
+; RV32-NEXT: sub a5, a1, a4
+; RV32-NEXT: sltu a6, a4, a1
+; RV32-NEXT: addi a6, a6, -1
+; RV32-NEXT: and a5, a6, a5
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmseq.vx v25, v16, a0, v0.t
+; RV32-NEXT: bltu a1, a4, .LBB190_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a1, a4
+; RV32-NEXT: .LBB190_2:
+; RV32-NEXT: vmv1r.v v0, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmseq.vx v16, v8, a0, v0.t
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v16, v25, a2
+; RV32-NEXT: vmv1r.v v0, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: icmp_eq_vx_nxv32i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v24, v0
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: srli a2, a3, 2
+; RV64-NEXT: slli a3, a3, 1
+; RV64-NEXT: vslidedown.vx v0, v0, a2
+; RV64-NEXT: sub a4, a1, a3
+; RV64-NEXT: sltu a5, a3, a1
+; RV64-NEXT: addi a5, a5, -1
+; RV64-NEXT: and a4, a5, a4
+; RV64-NEXT: vsetvli zero, a4, e32, m8, ta, ma
+; RV64-NEXT: vmseq.vx v25, v16, a0, v0.t
+; RV64-NEXT: bltu a1, a3, .LBB190_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a1, a3
+; RV64-NEXT: .LBB190_2:
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT: vmseq.vx v16, v8, a0, v0.t
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v16, v25, a2
+; RV64-NEXT: vmv1r.v v0, v16
+; RV64-NEXT: ret
%elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
%vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, metadata !"eq", <vscale x 32 x i1> %m, i32 %evl)
@@ -2277,31 +2351,58 @@ define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b,
}
define <vscale x 32 x i1> @icmp_eq_vx_swap_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: icmp_eq_vx_swap_nxv32i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vmv1r.v v24, v0
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: srli a2, a3, 2
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sub a4, a1, a3
-; CHECK-NEXT: sltu a5, a1, a4
-; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a4, a5, a4
-; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t
-; CHECK-NEXT: bltu a1, a3, .LBB191_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a1, a3
-; CHECK-NEXT: .LBB191_2:
-; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v16, v25, a2
-; CHECK-NEXT: vmv1r.v v0, v16
-; CHECK-NEXT: ret
+; RV32-LABEL: icmp_eq_vx_swap_nxv32i32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT: vmv1r.v v24, v0
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: srli a2, a3, 2
+; RV32-NEXT: slli a4, a3, 1
+; RV32-NEXT: vslidedown.vx v0, v0, a2
+; RV32-NEXT: sub a5, a1, a4
+; RV32-NEXT: sltu a6, a4, a1
+; RV32-NEXT: addi a6, a6, -1
+; RV32-NEXT: and a5, a6, a5
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmseq.vx v25, v16, a0, v0.t
+; RV32-NEXT: bltu a1, a4, .LBB191_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a1, a4
+; RV32-NEXT: .LBB191_2:
+; RV32-NEXT: vmv1r.v v0, v24
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmseq.vx v16, v8, a0, v0.t
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v16, v25, a2
+; RV32-NEXT: vmv1r.v v0, v16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: icmp_eq_vx_swap_nxv32i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT: vmv1r.v v24, v0
+; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: srli a2, a3, 2
+; RV64-NEXT: slli a3, a3, 1
+; RV64-NEXT: vslidedown.vx v0, v0, a2
+; RV64-NEXT: sub a4, a1, a3
+; RV64-NEXT: sltu a5, a3, a1
+; RV64-NEXT: addi a5, a5, -1
+; RV64-NEXT: and a4, a5, a4
+; RV64-NEXT: vsetvli zero, a4, e32, m8, ta, ma
+; RV64-NEXT: vmseq.vx v25, v16, a0, v0.t
+; RV64-NEXT: bltu a1, a3, .LBB191_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a1, a3
+; RV64-NEXT: .LBB191_2:
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT: vmseq.vx v16, v8, a0, v0.t
+; RV64-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v16, v25, a2
+; RV64-NEXT: vmv1r.v v0, v16
+; RV64-NEXT: ret
%elt.head = insertelement <vscale x 32 x i32> poison, i32 %b, i32 0
%vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> %vb, <vscale x 32 x i32> %va, metadata !"eq", <vscale x 32 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
index 6381887a1a2f9..3d34a619ce8bf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
@@ -595,7 +595,7 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
; CHECK-RV32-NEXT: vmv1r.v v9, v0
; CHECK-RV32-NEXT: csrr a4, vlenb
; CHECK-RV32-NEXT: sub a2, a3, a4
-; CHECK-RV32-NEXT: sltu a5, a3, a2
+; CHECK-RV32-NEXT: sltu a5, a4, a3
; CHECK-RV32-NEXT: addi a5, a5, -1
; CHECK-RV32-NEXT: and a2, a5, a2
; CHECK-RV32-NEXT: bltu a3, a4, .LBB55_2
@@ -621,7 +621,7 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
; CHECK-RV64-NEXT: vmv1r.v v9, v0
; CHECK-RV64-NEXT: csrr a4, vlenb
; CHECK-RV64-NEXT: sub a3, a2, a4
-; CHECK-RV64-NEXT: sltu a5, a2, a3
+; CHECK-RV64-NEXT: sltu a5, a4, a2
; CHECK-RV64-NEXT: addi a5, a5, -1
; CHECK-RV64-NEXT: and a3, a5, a3
; CHECK-RV64-NEXT: bltu a2, a4, .LBB55_2
@@ -647,19 +647,19 @@ define <vscale x 16 x double> @strided_load_nxv16f64(ptr %ptr, i64 %stride, <vsc
define <vscale x 16 x double> @strided_load_nxv16f64_allones_mask(ptr %ptr, i64 %stride, i32 zeroext %evl) {
; CHECK-RV32-LABEL: strided_load_nxv16f64_allones_mask:
; CHECK-RV32: # %bb.0:
-; CHECK-RV32-NEXT: csrr a4, vlenb
-; CHECK-RV32-NEXT: sub a2, a3, a4
-; CHECK-RV32-NEXT: sltu a5, a3, a2
+; CHECK-RV32-NEXT: csrr a2, vlenb
+; CHECK-RV32-NEXT: sub a4, a3, a2
+; CHECK-RV32-NEXT: sltu a5, a2, a3
; CHECK-RV32-NEXT: addi a5, a5, -1
-; CHECK-RV32-NEXT: and a2, a5, a2
-; CHECK-RV32-NEXT: bltu a3, a4, .LBB56_2
+; CHECK-RV32-NEXT: and a4, a5, a4
+; CHECK-RV32-NEXT: bltu a3, a2, .LBB56_2
; CHECK-RV32-NEXT: # %bb.1:
-; CHECK-RV32-NEXT: mv a3, a4
+; CHECK-RV32-NEXT: mv a3, a2
; CHECK-RV32-NEXT: .LBB56_2:
-; CHECK-RV32-NEXT: mul a4, a3, a1
-; CHECK-RV32-NEXT: add a4, a0, a4
-; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-RV32-NEXT: vlse64.v v16, (a4), a1
+; CHECK-RV32-NEXT: mul a2, a3, a1
+; CHECK-RV32-NEXT: add a2, a0, a2
+; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-RV32-NEXT: vlse64.v v16, (a2), a1
; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1
; CHECK-RV32-NEXT: ret
@@ -668,7 +668,7 @@ define <vscale x 16 x double> @strided_load_nxv16f64_allones_mask(ptr %ptr, i64
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: csrr a4, vlenb
; CHECK-RV64-NEXT: sub a3, a2, a4
-; CHECK-RV64-NEXT: sltu a5, a2, a3
+; CHECK-RV64-NEXT: sltu a5, a4, a2
; CHECK-RV64-NEXT: addi a5, a5, -1
; CHECK-RV64-NEXT: and a3, a5, a3
; CHECK-RV64-NEXT: bltu a2, a4, .LBB56_2
@@ -703,7 +703,7 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
; CHECK-RV32-NEXT: mv a6, a7
; CHECK-RV32-NEXT: .LBB57_2:
; CHECK-RV32-NEXT: sub a5, a6, a2
-; CHECK-RV32-NEXT: sltu t0, a6, a5
+; CHECK-RV32-NEXT: sltu t0, a2, a6
; CHECK-RV32-NEXT: addi t0, t0, -1
; CHECK-RV32-NEXT: and t0, t0, a5
; CHECK-RV32-NEXT: mv a5, a6
@@ -713,15 +713,15 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
; CHECK-RV32-NEXT: .LBB57_4:
; CHECK-RV32-NEXT: mul t1, a5, a1
; CHECK-RV32-NEXT: srli t2, a2, 3
-; CHECK-RV32-NEXT: sub a7, a3, a7
; CHECK-RV32-NEXT: vsetvli t3, zero, e8, mf4, ta, ma
; CHECK-RV32-NEXT: vslidedown.vx v0, v8, t2
+; CHECK-RV32-NEXT: sub t2, a3, a7
; CHECK-RV32-NEXT: add t1, a0, t1
; CHECK-RV32-NEXT: vsetvli zero, t0, e64, m8, ta, ma
; CHECK-RV32-NEXT: vlse64.v v16, (t1), a1, v0.t
-; CHECK-RV32-NEXT: sltu a3, a3, a7
+; CHECK-RV32-NEXT: sltu a3, a7, a3
; CHECK-RV32-NEXT: addi a3, a3, -1
-; CHECK-RV32-NEXT: and a3, a3, a7
+; CHECK-RV32-NEXT: and a3, a3, t2
; CHECK-RV32-NEXT: bltu a3, a2, .LBB57_6
; CHECK-RV32-NEXT: # %bb.5:
; CHECK-RV32-NEXT: mv a3, a2
@@ -751,7 +751,7 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
; CHECK-RV64-NEXT: mv a6, a7
; CHECK-RV64-NEXT: .LBB57_2:
; CHECK-RV64-NEXT: sub a5, a6, a4
-; CHECK-RV64-NEXT: sltu t0, a6, a5
+; CHECK-RV64-NEXT: sltu t0, a4, a6
; CHECK-RV64-NEXT: addi t0, t0, -1
; CHECK-RV64-NEXT: and t0, t0, a5
; CHECK-RV64-NEXT: mv a5, a6
@@ -761,15 +761,15 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
; CHECK-RV64-NEXT: .LBB57_4:
; CHECK-RV64-NEXT: mul t1, a5, a1
; CHECK-RV64-NEXT: srli t2, a4, 3
-; CHECK-RV64-NEXT: sub a7, a2, a7
; CHECK-RV64-NEXT: vsetvli t3, zero, e8, mf4, ta, ma
; CHECK-RV64-NEXT: vslidedown.vx v0, v8, t2
+; CHECK-RV64-NEXT: sub t2, a2, a7
; CHECK-RV64-NEXT: add t1, a0, t1
; CHECK-RV64-NEXT: vsetvli zero, t0, e64, m8, ta, ma
; CHECK-RV64-NEXT: vlse64.v v16, (t1), a1, v0.t
-; CHECK-RV64-NEXT: sltu a2, a2, a7
+; CHECK-RV64-NEXT: sltu a2, a7, a2
; CHECK-RV64-NEXT: addi a2, a2, -1
-; CHECK-RV64-NEXT: and a2, a2, a7
+; CHECK-RV64-NEXT: and a2, a2, t2
; CHECK-RV64-NEXT: bltu a2, a4, .LBB57_6
; CHECK-RV64-NEXT: # %bb.5:
; CHECK-RV64-NEXT: mv a2, a4
@@ -861,10 +861,10 @@ define <vscale x 16 x i64> @zero_strided_vadd_nxv16i64(<vscale x 16 x i64> %v, p
; CHECK-RV32: # %bb.0:
; CHECK-RV32-NEXT: csrr a1, vlenb
; CHECK-RV32-NEXT: srli a2, a1, 3
-; CHECK-RV32-NEXT: sub a1, a2, a1
-; CHECK-RV32-NEXT: sltu a3, a2, a1
-; CHECK-RV32-NEXT: addi a3, a3, -1
-; CHECK-RV32-NEXT: and a1, a3, a1
+; CHECK-RV32-NEXT: sub a3, a2, a1
+; CHECK-RV32-NEXT: sltu a1, a1, a2
+; CHECK-RV32-NEXT: addi a1, a1, -1
+; CHECK-RV32-NEXT: and a1, a1, a3
; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-RV32-NEXT: vlse64.v v24, (a0), zero
; CHECK-RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
index 2ec89888af077..12ff5e98c00e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
@@ -492,12 +492,12 @@ define void @strided_store_nxv16f64(<vscale x 16 x double> %v, ptr %ptr, i32 sig
; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: sub a5, a2, a3
+; CHECK-NEXT: sltu a2, a3, a2
; CHECK-NEXT: mul a4, a4, a1
; CHECK-NEXT: srli a3, a3, 3
-; CHECK-NEXT: sltu a2, a2, a5
+; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: vsetvli a6, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v0, a3
-; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a5
; CHECK-NEXT: add a0, a0, a4
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -508,25 +508,45 @@ define void @strided_store_nxv16f64(<vscale x 16 x double> %v, ptr %ptr, i32 sig
}
define void @strided_store_nxv16f64_allones_mask(<vscale x 16 x double> %v, ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
-; CHECK-LABEL: strided_store_nxv16f64_allones_mask:
-; CHECK: # %bb.0:
-; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: bltu a2, a4, .LBB47_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a3, a4
-; CHECK-NEXT: .LBB47_2:
-; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vsse64.v v8, (a0), a1
-; CHECK-NEXT: sub a4, a2, a4
-; CHECK-NEXT: mul a3, a3, a1
-; CHECK-NEXT: sltu a2, a2, a4
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a4
-; CHECK-NEXT: add a0, a0, a3
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vsse64.v v16, (a0), a1
-; CHECK-NEXT: ret
+; CHECK-RV32-LABEL: strided_store_nxv16f64_allones_mask:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: csrr a3, vlenb
+; CHECK-RV32-NEXT: mv a4, a2
+; CHECK-RV32-NEXT: bltu a2, a3, .LBB47_2
+; CHECK-RV32-NEXT: # %bb.1:
+; CHECK-RV32-NEXT: mv a4, a3
+; CHECK-RV32-NEXT: .LBB47_2:
+; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1
+; CHECK-RV32-NEXT: sub a5, a2, a3
+; CHECK-RV32-NEXT: sltu a2, a3, a2
+; CHECK-RV32-NEXT: mul a3, a4, a1
+; CHECK-RV32-NEXT: addi a2, a2, -1
+; CHECK-RV32-NEXT: and a2, a2, a5
+; CHECK-RV32-NEXT: add a0, a0, a3
+; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV32-NEXT: vsse64.v v16, (a0), a1
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: strided_store_nxv16f64_allones_mask:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: csrr a4, vlenb
+; CHECK-RV64-NEXT: mv a3, a2
+; CHECK-RV64-NEXT: bltu a2, a4, .LBB47_2
+; CHECK-RV64-NEXT: # %bb.1:
+; CHECK-RV64-NEXT: mv a3, a4
+; CHECK-RV64-NEXT: .LBB47_2:
+; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1
+; CHECK-RV64-NEXT: sub a5, a2, a4
+; CHECK-RV64-NEXT: sltu a2, a4, a2
+; CHECK-RV64-NEXT: mul a3, a3, a1
+; CHECK-RV64-NEXT: addi a2, a2, -1
+; CHECK-RV64-NEXT: and a2, a2, a5
+; CHECK-RV64-NEXT: add a0, a0, a3
+; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV64-NEXT: vsse64.v v16, (a0), a1
+; CHECK-RV64-NEXT: ret
call void @llvm.experimental.vp.strided.store.nxv16f64.p0.i32(<vscale x 16 x double> %v, ptr %ptr, i32 %stride, <vscale x 16 x i1> splat (i1 true), i32 %evl)
ret void
}
@@ -554,19 +574,19 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
; CHECK-NEXT: vsetvli zero, a7, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a1), a2, v0.t
; CHECK-NEXT: sub a0, a5, a4
-; CHECK-NEXT: mul a7, a7, a2
-; CHECK-NEXT: srli t0, a4, 3
-; CHECK-NEXT: sub a6, a3, a6
+; CHECK-NEXT: sub t0, a3, a6
+; CHECK-NEXT: sltu a3, a6, a3
+; CHECK-NEXT: srli a6, a4, 3
; CHECK-NEXT: vsetvli t1, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v7, t0
-; CHECK-NEXT: sltu t0, a5, a0
+; CHECK-NEXT: vslidedown.vx v0, v7, a6
+; CHECK-NEXT: sltu a6, a4, a5
+; CHECK-NEXT: mul a7, a7, a2
+; CHECK-NEXT: addi a6, a6, -1
; CHECK-NEXT: add a7, a1, a7
-; CHECK-NEXT: sltu a3, a3, a6
-; CHECK-NEXT: addi t0, t0, -1
; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and t0, t0, a0
-; CHECK-NEXT: and a0, a3, a6
-; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma
+; CHECK-NEXT: and a6, a6, a0
+; CHECK-NEXT: and a0, a3, t0
+; CHECK-NEXT: vsetvli zero, a6, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v16, (a7), a2, v0.t
; CHECK-NEXT: bltu a0, a4, .LBB48_6
; CHECK-NEXT: # %bb.5:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
index c64b755051898..6378135654ed1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
@@ -551,7 +551,7 @@ define <vscale x 128 x i8> @vadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub a2, a1, a0
-; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: sltu a3, a0, a1
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
@@ -574,7 +574,7 @@ define <vscale x 128 x i8> @vadd_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va, i
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
@@ -1306,7 +1306,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -1329,7 +1329,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -1354,11 +1354,11 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, <v
; RV32-NEXT: srli a1, a0, 2
; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; RV32-NEXT: vadd.vi v8, v8, -1, v0.t
-; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV32-NEXT: slli a2, a0, 1
+; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a1
-; RV32-NEXT: slli a1, a0, 1
-; RV32-NEXT: sub a1, a0, a1
-; RV32-NEXT: sltu a0, a0, a1
+; RV32-NEXT: sub a1, a0, a2
+; RV32-NEXT: sltu a0, a2, a0
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a1
; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -1374,7 +1374,7 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, <v
; RV64-NEXT: slli a1, a0, 1
; RV64-NEXT: vslidedown.vx v0, v0, a2
; RV64-NEXT: sub a2, a0, a1
-; RV64-NEXT: sltu a3, a0, a2
+; RV64-NEXT: sltu a3, a1, a0
; RV64-NEXT: addi a3, a3, -1
; RV64-NEXT: and a2, a3, a2
; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
index e0fcd4009ad2e..7d97f353a22b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
@@ -847,7 +847,7 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -869,7 +869,7 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index d03b068e11ea8..42b1da9d97f2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -928,13 +928,13 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; ZVFH-NEXT: slli a1, a2, 1
; ZVFH-NEXT: srli a2, a2, 2
; ZVFH-NEXT: sub a3, a0, a1
+; ZVFH-NEXT: sltu a4, a1, a0
; ZVFH-NEXT: vslidedown.vx v0, v0, a2
-; ZVFH-NEXT: sltu a2, a0, a3
-; ZVFH-NEXT: addi a2, a2, -1
-; ZVFH-NEXT: and a2, a2, a3
-; ZVFH-NEXT: addi a3, sp, 16
-; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT: addi a4, a4, -1
+; ZVFH-NEXT: and a3, a4, a3
+; ZVFH-NEXT: addi a2, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -977,13 +977,13 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1026,13 +1026,13 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: sltu a4, a1, a0
; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16alt, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1079,14 +1079,14 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; ZVFH-NEXT: slli a1, a2, 1
; ZVFH-NEXT: srli a2, a2, 2
; ZVFH-NEXT: sub a3, a0, a1
-; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFH-NEXT: sltu a4, a1, a0
+; ZVFH-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFH-NEXT: vslidedown.vx v0, v24, a2
-; ZVFH-NEXT: sltu a2, a0, a3
-; ZVFH-NEXT: addi a2, a2, -1
-; ZVFH-NEXT: and a2, a2, a3
-; ZVFH-NEXT: addi a3, sp, 16
-; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFH-NEXT: addi a4, a4, -1
+; ZVFH-NEXT: and a3, a4, a3
+; ZVFH-NEXT: addi a2, sp, 16
+; ZVFH-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFH-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1128,14 +1128,14 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1177,14 +1177,14 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
-; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: sltu a4, a1, a0
+; ZVFBFA-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16alt, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1216,130 +1216,6 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
}
define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfadd_vf_nxv32bf16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: addi sp, sp, -16
-; ZVFH-NEXT: .cfi_def_cfa_offset 16
-; ZVFH-NEXT: csrr a1, vlenb
-; ZVFH-NEXT: slli a1, a1, 4
-; ZVFH-NEXT: sub sp, sp, a1
-; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFH-NEXT: vmv1r.v v7, v0
-; ZVFH-NEXT: fmv.x.h a1, fa0
-; ZVFH-NEXT: csrr a2, vlenb
-; ZVFH-NEXT: vmv.v.x v24, a1
-; ZVFH-NEXT: slli a1, a2, 1
-; ZVFH-NEXT: srli a2, a2, 2
-; ZVFH-NEXT: sub a3, a0, a1
-; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFH-NEXT: vslidedown.vx v0, v0, a2
-; ZVFH-NEXT: sltu a2, a0, a3
-; ZVFH-NEXT: addi a2, a2, -1
-; ZVFH-NEXT: and a2, a2, a3
-; ZVFH-NEXT: csrr a3, vlenb
-; ZVFH-NEXT: slli a3, a3, 3
-; ZVFH-NEXT: add a3, sp, a3
-; ZVFH-NEXT: addi a3, a3, 16
-; ZVFH-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vfadd.vv v16, v24, v16, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFH-NEXT: bltu a0, a1, .LBB24_2
-; ZVFH-NEXT: # %bb.1:
-; ZVFH-NEXT: mv a0, a1
-; ZVFH-NEXT: .LBB24_2:
-; ZVFH-NEXT: vmv1r.v v0, v7
-; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add a0, sp, a0
-; ZVFH-NEXT: addi a0, a0, 16
-; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 4
-; ZVFH-NEXT: add sp, sp, a0
-; ZVFH-NEXT: .cfi_def_cfa sp, 16
-; ZVFH-NEXT: addi sp, sp, 16
-; ZVFH-NEXT: .cfi_def_cfa_offset 0
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v7, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB24_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB24_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
-;
; ZVFBFA-LABEL: vfadd_vf_nxv32bf16:
; ZVFBFA: # %bb.0:
; ZVFBFA-NEXT: addi sp, sp, -16
@@ -1355,14 +1231,14 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
-; ZVFBFA-NEXT: vsetvli a4, zero, e8alt, mf2, ta, ma
+; ZVFBFA-NEXT: sltu a4, a1, a0
+; ZVFBFA-NEXT: vsetvli a5, zero, e8alt, mf2, ta, ma
; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16alt, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v28, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1397,108 +1273,6 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
}
define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; ZVFH-LABEL: vfadd_vf_nxv32bf16_unmasked:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: addi sp, sp, -16
-; ZVFH-NEXT: .cfi_def_cfa_offset 16
-; ZVFH-NEXT: csrr a1, vlenb
-; ZVFH-NEXT: slli a1, a1, 3
-; ZVFH-NEXT: sub sp, sp, a1
-; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFH-NEXT: fmv.x.h a1, fa0
-; ZVFH-NEXT: csrr a2, vlenb
-; ZVFH-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFH-NEXT: vmset.m v24
-; ZVFH-NEXT: vmv.v.x v16, a1
-; ZVFH-NEXT: slli a1, a2, 1
-; ZVFH-NEXT: srli a2, a2, 2
-; ZVFH-NEXT: sub a3, a0, a1
-; ZVFH-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFH-NEXT: vslidedown.vx v0, v24, a2
-; ZVFH-NEXT: sltu a2, a0, a3
-; ZVFH-NEXT: addi a2, a2, -1
-; ZVFH-NEXT: and a2, a2, a3
-; ZVFH-NEXT: addi a3, sp, 16
-; ZVFH-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vfadd.vv v16, v16, v24, v0.t
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFH-NEXT: bltu a0, a1, .LBB25_2
-; ZVFH-NEXT: # %bb.1:
-; ZVFH-NEXT: mv a0, a1
-; ZVFH-NEXT: .LBB25_2:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v8
-; ZVFH-NEXT: addi a0, sp, 16
-; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v0
-; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT: vfadd.vv v16, v16, v24
-; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v16
-; ZVFH-NEXT: csrr a0, vlenb
-; ZVFH-NEXT: slli a0, a0, 3
-; ZVFH-NEXT: add sp, sp, a0
-; ZVFH-NEXT: .cfi_def_cfa sp, 16
-; ZVFH-NEXT: addi sp, sp, 16
-; ZVFH-NEXT: .cfi_def_cfa_offset 0
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32bf16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: vmv.v.x v16, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB25_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB25_2:
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
-;
; ZVFBFA-LABEL: vfadd_vf_nxv32bf16_unmasked:
; ZVFBFA: # %bb.0:
; ZVFBFA-NEXT: addi sp, sp, -16
@@ -1514,14 +1288,14 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
-; ZVFBFA-NEXT: vsetvli a4, zero, e8alt, mf2, ta, ma
+; ZVFBFA-NEXT: sltu a4, a1, a0
+; ZVFBFA-NEXT: vsetvli a5, zero, e8alt, mf2, ta, ma
; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16alt, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -2351,13 +2125,13 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -2400,13 +2174,13 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
+; ZVFBFA-NEXT: sltu a4, a1, a0
; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -2459,14 +2233,14 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -2508,14 +2282,14 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
-; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: sltu a4, a1, a0
+; ZVFBFA-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -2553,68 +2327,6 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFH-NEXT: vfadd.vf v8, v8, fa0, v0.t
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v7, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB50_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB50_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
-;
; ZVFBFA-LABEL: vfadd_vf_nxv32f16:
; ZVFBFA: # %bb.0:
; ZVFBFA-NEXT: addi sp, sp, -16
@@ -2631,17 +2343,17 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
-; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: sltu a4, a1, a0
+; ZVFBFA-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: csrr a3, vlenb
-; ZVFBFA-NEXT: slli a3, a3, 3
-; ZVFBFA-NEXT: add a3, sp, a3
-; ZVFBFA-NEXT: addi a3, a3, 16
-; ZVFBFA-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: csrr a2, vlenb
+; ZVFBFA-NEXT: slli a2, a2, 3
+; ZVFBFA-NEXT: add a2, sp, a2
+; ZVFBFA-NEXT: addi a2, a2, 16
+; ZVFBFA-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v28, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -2689,57 +2401,6 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFH-NEXT: vfadd.vf v8, v8, fa0
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfadd_vf_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: vmv.v.x v16, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB51_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB51_2:
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfadd.vv v16, v16, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
-;
; ZVFBFA-LABEL: vfadd_vf_nxv32f16_unmasked:
; ZVFBFA: # %bb.0:
; ZVFBFA-NEXT: addi sp, sp, -16
@@ -2756,14 +2417,14 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFBFA-NEXT: slli a1, a2, 1
; ZVFBFA-NEXT: srli a2, a2, 2
; ZVFBFA-NEXT: sub a3, a0, a1
-; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFBFA-NEXT: sltu a4, a1, a0
+; ZVFBFA-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFBFA-NEXT: vslidedown.vx v0, v24, a2
-; ZVFBFA-NEXT: sltu a2, a0, a3
-; ZVFBFA-NEXT: addi a2, a2, -1
-; ZVFBFA-NEXT: and a2, a2, a3
-; ZVFBFA-NEXT: addi a3, sp, 16
-; ZVFBFA-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFBFA-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFBFA-NEXT: addi a4, a4, -1
+; ZVFBFA-NEXT: and a3, a4, a3
+; ZVFBFA-NEXT: addi a2, sp, 16
+; ZVFBFA-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFBFA-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
index e9d7137919ac9..5f8603067d82a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
@@ -367,13 +367,13 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -420,14 +420,14 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -459,67 +459,6 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
}
define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfdiv_vf_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vmv.v.x v24, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB22_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB22_2:
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -527,56 +466,6 @@ define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
}
define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfdiv_vf_nxv32bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmset.m v24
-; CHECK-NEXT: vmv.v.x v16, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB23_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB23_2:
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v16, v16, v24
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -1064,13 +953,13 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1123,14 +1012,14 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1167,68 +1056,6 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfdiv.vf v8, v8, fa0, v0.t
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v7, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB46_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB46_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fdiv.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -1241,57 +1068,6 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfdiv.vf v8, v8, fa0
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: vmv.v.x v16, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB47_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB47_2:
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fdiv.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index c25a0d47c5c53..03cbe8c5d555c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -600,16 +600,16 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
; CHECK-NEXT: slli a0, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a1, a0
+; CHECK-NEXT: sltu a4, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a1, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -716,17 +716,17 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat>
; CHECK-NEXT: slli a0, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a1, a0
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a0, a1
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a1, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -815,7 +815,7 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfl
; CHECK-NEXT: srli a3, a3, 2
; CHECK-NEXT: sub a4, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a3
-; CHECK-NEXT: sltu a3, a0, a4
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
; CHECK-NEXT: csrr a4, vlenb
@@ -912,124 +912,6 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfl
}
define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_commute(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfma_vf_nxv32bf16_commute:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v3, v0
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a2
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vmv.v.x v8, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28, v0.t
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v4, v8, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB33_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB33_2:
-; CHECK-NEXT: vmv1r.v v0, v3
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; CHECK-NEXT: vmv4r.v v12, v4
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> %m, i32 %evl)
@@ -1058,7 +940,7 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat>
; CHECK-NEXT: sub a4, a0, a1
; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v8, a3
-; CHECK-NEXT: sltu a3, a0, a4
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a4
; CHECK-NEXT: csrr a4, vlenb
@@ -1161,107 +1043,6 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat>
}
define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked_commute(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x bfloat> %vc, i32 zeroext %evl) {
-; CHECK-LABEL: vfma_vf_nxv32bf16_unmasked_commute:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 5
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: add a1, a1, a2
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmset.m v8
-; CHECK-NEXT: vmv.v.x v24, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v8, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 4
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20, v0.t
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a2, sp, a2
-; CHECK-NEXT: addi a2, a2, 16
-; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28, v0.t
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v20, v8, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB35_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB35_2:
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfmadd.vv v0, v24, v8
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v16, v0
-; CHECK-NEXT: vmv8r.v v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fma.nxv32bf16(<vscale x 32 x bfloat> %vb, <vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -2049,16 +1830,16 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a0, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a1, a0
+; ZVFHMIN-NEXT: sltu a4, a0, a1
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a1, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -2172,17 +1953,17 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: slli a0, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a1, a0
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a0, a1
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a1, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
@@ -2277,7 +2058,7 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -2378,153 +2159,34 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfmadd.vf v8, fa0, v16, v0.t
+; ZVFH-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+ %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfma_vf_nxv32f16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vfmadd.vf v8, fa0, v16
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfma_vf_nxv32f16_commute:
+; ZVFHMIN-LABEL: vfma_vf_nxv32f16_unmasked:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 3
; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: slli a1, a1, 2
; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v8, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 4
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB69_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB69_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
- %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfma_vf_nxv32f16_unmasked:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT: vfmadd.vf v8, fa0, v16
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfma_vf_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 2
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmv8r.v v24, v8
-; ZVFHMIN-NEXT: fmv.x.h a2, fa0
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT: vmv8r.v v24, v8
+; ZVFHMIN-NEXT: fmv.x.h a2, fa0
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: vmset.m v8
; ZVFHMIN-NEXT: slli a1, a3, 1
@@ -2532,7 +2194,7 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va,
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -2640,108 +2302,6 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked_commute(<vscale x 32 x ha
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfmadd.vf v8, fa0, v16
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfma_vf_nxv32f16_unmasked_commute:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v8
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 4
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB71_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB71_2:
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0
-; ZVFHMIN-NEXT: vmv8r.v v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -3428,14 +2988,14 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: slli a5, a1, 3
; CHECK-NEXT: sub a6, a4, a1
; CHECK-NEXT: add a7, a2, a5
-; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: vl8re64.v v8, (a7)
; CHECK-NEXT: csrr a7, vlenb
; CHECK-NEXT: slli a7, a7, 3
; CHECK-NEXT: add a7, sp, a7
; CHECK-NEXT: addi a7, a7, 16
; CHECK-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: sltu a7, a4, a6
+; CHECK-NEXT: sltu a7, a1, a4
+; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: addi a7, a7, -1
; CHECK-NEXT: vl8re64.v v8, (a5)
; CHECK-NEXT: csrr a5, vlenb
@@ -3563,7 +3123,7 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: sub a5, a4, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v24, (a3)
-; CHECK-NEXT: sltu a3, a4, a5
+; CHECK-NEXT: sltu a3, a1, a4
; CHECK-NEXT: vl8re64.v v8, (a2)
; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
@@ -7976,35 +7536,36 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vmv8r.v v16, v8
-; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT: vmv8r.v v24, v8
+; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
; ZVFHMIN-NEXT: lui a2, 8
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a0, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: sub a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t
+; ZVFHMIN-NEXT: sltu a3, a0, a1
; ZVFHMIN-NEXT: addi a3, a3, -1
+; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vxor.vx v16, v8, a2, v0.t
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: vmv1r.v v0, v6
+; ZVFHMIN-NEXT: vmv4r.v v8, v16
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: mv a3, a2
@@ -8013,37 +7574,37 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
; ZVFHMIN-NEXT: bltu a1, a0, .LBB280_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a0
; ZVFHMIN-NEXT: .LBB280_2:
; ZVFHMIN-NEXT: vmv1r.v v0, v3
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
@@ -8052,17 +7613,17 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
; ZVFHMIN-NEXT: vmv.v.v v16, v8
; ZVFHMIN-NEXT: vmv4r.v v12, v4
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
@@ -8114,10 +7675,10 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: sub a4, a1, a0
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
+; ZVFHMIN-NEXT: sltu a3, a0, a1
+; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: vmv4r.v v8, v16
; ZVFHMIN-NEXT: addi a2, sp, 16
@@ -8229,7 +7790,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -8338,128 +7899,6 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfmsub.vf v8, fa0, v16, v0.t
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfmsub_vf_nxv32f16_commute:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v8, a1
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB283_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB283_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
@@ -8498,7 +7937,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -8606,9 +8045,75 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfmsub.vf v8, fa0, v16
+; ZVFH-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+ %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+ %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v16, v8, v24, v0.t
+; ZVFH-NEXT: vmv.v.v v8, v16
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16_commuted:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfmsub_vf_nxv32f16_unmasked_commute:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
@@ -8616,30 +8121,31 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: slli a1, a1, 5
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv1r.v v3, v0
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: lui a2, 8
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: mv a4, a1
; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
+; ZVFHMIN-NEXT: add a1, a1, a4
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v8, a1
+; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t
+; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
; ZVFHMIN-NEXT: add a3, sp, a3
@@ -8654,7 +8160,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: mv a3, a2
@@ -8662,27 +8168,39 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
+; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB285_2
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB290_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB285_2:
+; ZVFHMIN-NEXT: .LBB290_2:
+; ZVFHMIN-NEXT: vmv1r.v v0, v3
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 3
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: mv a1, a0
@@ -8690,1046 +8208,21 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0
-; ZVFHMIN-NEXT: vmv8r.v v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
- %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v16, v8, v24, v0.t
-; ZVFH-NEXT: vmv.v.v v8, v16
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a0, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: csrr a5, vlenb
-; ZVFHMIN-NEXT: slli a5, a5, 4
-; ZVFHMIN-NEXT: add a5, sp, a5
-; ZVFHMIN-NEXT: addi a5, a5, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v24, v24, a2, v0.t
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB286_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB286_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16_commuted:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_commuted:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a0, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB287_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB287_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v16, v8
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT: slli a0, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB288_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB288_2:
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vv_nxv32f16_unmasked_commuted:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT: slli a0, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB289_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB289_2:
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a4, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a4
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB290_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB290_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v16, v8
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
- %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
- %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_commute:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_commute:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a4, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a4
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB291_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB291_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v16, v8
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
- %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
- %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 2
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a2, fa0
-; ZVFHMIN-NEXT: lui a1, 8
-; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v7
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v24, v8, a1
-; ZVFHMIN-NEXT: vxor.vx v8, v16, a1
-; ZVFHMIN-NEXT: slli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a0, a1
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 3
-; ZVFHMIN-NEXT: mv a5, a4
-; ZVFHMIN-NEXT: slli a4, a4, 1
-; ZVFHMIN-NEXT: add a4, a4, a5
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 4
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 5
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 3
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v8, a2
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB292_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB292_2:
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v0, v16
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 2
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
- %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a2, fa0
-; ZVFHMIN-NEXT: lui a1, 8
-; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a1
-; ZVFHMIN-NEXT: slli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a0, a1
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 4
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 3
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a4, vlenb
-; ZVFHMIN-NEXT: slli a4, a4, 3
-; ZVFHMIN-NEXT: mv a5, a4
-; ZVFHMIN-NEXT: slli a4, a4, 1
-; ZVFHMIN-NEXT: add a4, a4, a5
-; ZVFHMIN-NEXT: add a4, sp, a4
-; ZVFHMIN-NEXT: addi a4, a4, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v8, a2
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB293_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB293_2:
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT: vmv.v.v v16, v8
+; ZVFHMIN-NEXT: vmv4r.v v12, v4
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
@@ -9739,20 +8232,20 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
+ %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_commute:
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_commute:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
@@ -9760,30 +8253,31 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: slli a1, a1, 5
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv1r.v v3, v0
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
+; ZVFHMIN-NEXT: lui a2, 8
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: mv a4, a1
; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
+; ZVFHMIN-NEXT: add a1, a1, a4
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
+; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t
+; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
; ZVFHMIN-NEXT: add a3, sp, a3
@@ -9814,10 +8308,10 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB294_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB291_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB294_2:
+; ZVFHMIN-NEXT: .LBB291_2:
; ZVFHMIN-NEXT: vmv1r.v v0, v3
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 3
@@ -9870,249 +8364,134 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
- %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x half> %v
-}
-
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a3, 1
-; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB295_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB295_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v3
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v16, v8
-; ZVFHMIN-NEXT: vmv4r.v v12, v4
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
- %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
- %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
%negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked:
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 5
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 3
; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: slli a1, a1, 2
; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT: fmv.x.h a2, fa0
+; ZVFHMIN-NEXT: lui a1, 8
; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma
; ZVFHMIN-NEXT: vmset.m v7
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT: vxor.vx v24, v8, a1
+; ZVFHMIN-NEXT: vxor.vx v8, v16, a1
; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sub a4, a0, a1
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 4
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: and a3, a3, a4
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 4
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 5
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v8, a2
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB296_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB292_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB296_2:
+; ZVFHMIN-NEXT: .LBB292_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24
+; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
+; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v16
+; ZVFHMIN-NEXT: vfmadd.vv v24, v0, v16
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 5
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 2
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
; ZVFHMIN-NEXT: addi sp, sp, 16
@@ -10120,20 +8499,20 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
%negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negva, <vscale x 32 x half> %vb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_unmasked_commute:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
@@ -10141,80 +8520,74 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: slli a1, a1, 5
; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: mv a2, a1
-; ZVFHMIN-NEXT: slli a1, a1, 1
-; ZVFHMIN-NEXT: add a1, a1, a2
-; ZVFHMIN-NEXT: add a1, sp, a1
-; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: lui a2, 8
+; ZVFHMIN-NEXT: fmv.x.h a2, fa0
+; ZVFHMIN-NEXT: lui a1, 8
; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v7
+; ZVFHMIN-NEXT: vmset.m v24
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT: vxor.vx v8, v8, a1
+; ZVFHMIN-NEXT: vxor.vx v16, v16, a1
; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
+; ZVFHMIN-NEXT: sub a4, a0, a1
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 4
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: and a3, a3, a4
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 4
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: slli a4, a4, 3
+; ZVFHMIN-NEXT: mv a5, a4
+; ZVFHMIN-NEXT: slli a4, a4, 1
+; ZVFHMIN-NEXT: add a4, a4, a5
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v8, a2
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB297_2
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB293_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB297_2:
+; ZVFHMIN-NEXT: .LBB293_2:
; ZVFHMIN-NEXT: csrr a1, vlenb
; ZVFHMIN-NEXT: slli a1, a1, 4
; ZVFHMIN-NEXT: add a1, sp, a1
; ZVFHMIN-NEXT: addi a1, a1, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: mv a1, a0
@@ -10222,15 +8595,20 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8
+; ZVFHMIN-NEXT: vfmadd.vv v16, v0, v24
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0
-; ZVFHMIN-NEXT: vmv8r.v v8, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
@@ -10240,79 +8618,66 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
- %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negva = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
%negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x half> %negva, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v16, v8, v24, v0.t
-; ZVFH-NEXT: vmv.v.v v8, v16
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 5
+; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
; ZVFHMIN-NEXT: lui a2, 8
; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a0, a3, 1
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
+; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
+; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t
+; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
+; ZVFHMIN-NEXT: sub a2, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: csrr a5, vlenb
-; ZVFHMIN-NEXT: slli a5, a5, 4
-; ZVFHMIN-NEXT: add a5, sp, a5
-; ZVFHMIN-NEXT: addi a5, a5, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a5) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v24, v24, a2, v0.t
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
+; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: slli a3, a3, 3
+; ZVFHMIN-NEXT: add a3, sp, a3
+; ZVFHMIN-NEXT: addi a3, a3, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: mv a3, a2
@@ -10320,30 +8685,39 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
+; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB298_2
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB294_2
; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB298_2:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB294_2:
; ZVFHMIN-NEXT: vmv1r.v v0, v3
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: mv a1, a0
+; ZVFHMIN-NEXT: slli a0, a0, 1
+; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
@@ -10353,27 +8727,19 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: slli a0, a0, 1
-; ZVFHMIN-NEXT: add a0, a0, a1
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT: vmv.v.v v16, v8
; ZVFHMIN-NEXT: vmv4r.v v12, v4
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24, v0.t
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
@@ -10381,68 +8747,68 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+ %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+ %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+ %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16_commuted:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16, v0.t
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_commuted:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_commute:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 5
+; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv1r.v v3, v0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v8, (a0)
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
; ZVFHMIN-NEXT: lui a2, 8
; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
-; ZVFHMIN-NEXT: slli a0, a3, 1
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
+; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
+; ZVFHMIN-NEXT: vxor.vx v8, v24, a2, v0.t
+; ZVFHMIN-NEXT: vxor.vx v16, v16, a2, v0.t
+; ZVFHMIN-NEXT: sub a2, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
-; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
+; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: slli a3, a3, 3
+; ZVFHMIN-NEXT: add a3, sp, a3
+; ZVFHMIN-NEXT: addi a3, a3, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: mv a3, a2
@@ -10450,38 +8816,38 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB299_2
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB295_2
; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB299_2:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB295_2:
; ZVFHMIN-NEXT: vmv1r.v v0, v3
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
@@ -10490,15 +8856,15 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t
; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t
; ZVFHMIN-NEXT: vmv.v.v v16, v8
@@ -10512,68 +8878,69 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+ %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+ %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+ %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> %m, i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_unmasked:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 5
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v8
+; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT: vmset.m v7
; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT: slli a0, a3, 1
+; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; ZVFHMIN-NEXT: vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
+; ZVFHMIN-NEXT: sub a2, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: slli a3, a3, 4
+; ZVFHMIN-NEXT: add a3, sp, a3
+; ZVFHMIN-NEXT: addi a3, a3, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: mv a3, a2
@@ -10589,16 +8956,16 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB300_2
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB296_2
; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB300_2:
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB296_2:
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
@@ -10607,7 +8974,7 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: mv a1, a0
@@ -10615,14 +8982,14 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v0
+; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v16
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
@@ -10630,68 +8997,69 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
- %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
- %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+ %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+ %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negvb, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x half> %v
}
-define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
-; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted:
+define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vscale x 32 x half> %va, half %b, <vscale x 32 x half> %vc, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vl8re16.v v24, (a0)
-; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vf v8, fa0, v16
; ZVFH-NEXT: ret
;
-; ZVFHMIN-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted:
+; ZVFHMIN-LABEL: vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute:
; ZVFHMIN: # %bb.0:
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 5
-; ZVFHMIN-NEXT: sub sp, sp, a2
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 5
+; ZVFHMIN-NEXT: sub sp, sp, a1
; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 3
-; ZVFHMIN-NEXT: mv a3, a2
-; ZVFHMIN-NEXT: slli a2, a2, 1
-; ZVFHMIN-NEXT: add a2, a2, a3
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: mv a2, a1
+; ZVFHMIN-NEXT: slli a1, a1, 1
+; ZVFHMIN-NEXT: add a1, a1, a2
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: fmv.x.h a1, fa0
; ZVFHMIN-NEXT: lui a2, 8
-; ZVFHMIN-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT: vmset.m v8
+; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT: vmset.m v7
; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT: vmv.v.x v24, a1
; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT: slli a0, a3, 1
+; ZVFHMIN-NEXT: slli a1, a3, 1
; ZVFHMIN-NEXT: srli a3, a3, 2
-; ZVFHMIN-NEXT: sub a4, a1, a0
-; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a1, a4
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; ZVFHMIN-NEXT: vxor.vx v8, v24, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a3, a3, a4
+; ZVFHMIN-NEXT: sub a2, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
+; ZVFHMIN-NEXT: csrr a3, vlenb
+; ZVFHMIN-NEXT: slli a3, a3, 4
+; ZVFHMIN-NEXT: add a3, sp, a3
+; ZVFHMIN-NEXT: addi a3, a3, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: slli a2, a2, 4
-; ZVFHMIN-NEXT: add a2, sp, a2
-; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
; ZVFHMIN-NEXT: mv a3, a2
@@ -10699,33 +9067,33 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
; ZVFHMIN-NEXT: add a2, a2, a3
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
+; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a1, a0, .LBB301_2
+; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB297_2
; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a1, a0
-; ZVFHMIN-NEXT: .LBB301_2:
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB297_2:
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: addi a0, sp, 16
; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: mv a1, a0
@@ -10733,14 +9101,15 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
; ZVFHMIN-NEXT: add a0, a0, a1
; ZVFHMIN-NEXT: add a0, sp, a0
; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0
+; ZVFHMIN-NEXT: vmv8r.v v8, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 5
; ZVFHMIN-NEXT: add sp, sp, a0
@@ -10748,6 +9117,61 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
; ZVFHMIN-NEXT: addi sp, sp, 16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
; ZVFHMIN-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
+ %vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
+ %negvb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negvc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %vc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negvb, <vscale x 32 x half> %va, <vscale x 32 x half> %negvc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v16, v8, v24, v0.t
+; ZVFH-NEXT: vmv.v.v v8, v16
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16_commuted:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v8, v16, v24, v0.t
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> %m, i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> %m, i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT: ret
+ %negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ %v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %negb, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32 x half> %va, <vscale x 32 x half> %b, <vscale x 32 x half> %c, i32 zeroext %evl) {
+; ZVFH-LABEL: vfnmsub_vv_nxv32f16_unmasked_commuted:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vl8re16.v v24, (a0)
+; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; ZVFH-NEXT: vfnmadd.vv v8, v16, v24
+; ZVFH-NEXT: ret
%negb = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %b, <vscale x 32 x i1> splat (i1 true), i32 %evl)
%negc = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %c, <vscale x 32 x i1> splat (i1 true), i32 %evl)
%v = call <vscale x 32 x half> @llvm.vp.fma.nxv32f16(<vscale x 32 x half> %negb, <vscale x 32 x half> %va, <vscale x 32 x half> %negc, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -10781,7 +9205,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -10912,7 +9336,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -11038,7 +9462,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -11168,7 +9592,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
; ZVFHMIN-NEXT: sub a4, a0, a1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a4
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a3, a3, a4
; ZVFHMIN-NEXT: csrr a4, vlenb
@@ -11296,11 +9720,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
; ZVFHMIN-NEXT: add a3, sp, a3
@@ -11425,11 +9849,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
; ZVFHMIN-NEXT: add a3, sp, a3
@@ -11560,11 +9984,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
; ZVFHMIN-NEXT: vmv4r.v v8, v16
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
@@ -11679,11 +10103,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
; ZVFHMIN-NEXT: srli a3, a3, 2
; ZVFHMIN-NEXT: vxor.vx v16, v16, a2
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3
-; ZVFHMIN-NEXT: sltu a3, a0, a2
-; ZVFHMIN-NEXT: addi a3, a3, -1
-; ZVFHMIN-NEXT: and a2, a3, a2
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a2, a4, a2
; ZVFHMIN-NEXT: vmv4r.v v8, v16
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
index 394887fee67fc..803680dd09061 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
@@ -177,13 +177,13 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -230,14 +230,14 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -499,13 +499,13 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -558,14 +558,14 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
index 5c5542619b6ef..43b62bb7f9f76 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
@@ -177,13 +177,13 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -230,14 +230,14 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -499,13 +499,13 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -558,14 +558,14 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
index eb77b4b4dbac3..39f0163de048c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
@@ -489,13 +489,13 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -548,14 +548,14 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -592,68 +592,6 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfmul.vf v8, v8, fa0, v0.t
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfmul_vf_nxv32f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v7, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmul.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB22_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB22_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fmul.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -666,57 +604,6 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfmul.vf v8, v8, fa0
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfmul_vf_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: vmv.v.x v16, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB23_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB23_2:
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmul.vv v16, v16, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fmul.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
index 03de2c97e685c..37ee3ad000854 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
@@ -1096,14 +1096,14 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
; CHECK-NEXT: slli a5, a1, 3
; CHECK-NEXT: sub a6, a4, a1
; CHECK-NEXT: add a7, a2, a5
-; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: vl8re64.v v8, (a7)
; CHECK-NEXT: csrr a7, vlenb
; CHECK-NEXT: slli a7, a7, 3
; CHECK-NEXT: add a7, sp, a7
; CHECK-NEXT: addi a7, a7, 16
; CHECK-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: sltu a7, a4, a6
+; CHECK-NEXT: sltu a7, a1, a4
+; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: addi a7, a7, -1
; CHECK-NEXT: vl8re64.v v8, (a5)
; CHECK-NEXT: csrr a5, vlenb
@@ -1217,7 +1217,7 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK-NEXT: sub a5, a4, a1
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vl8re64.v v24, (a3)
-; CHECK-NEXT: sltu a3, a4, a5
+; CHECK-NEXT: sltu a3, a1, a4
; CHECK-NEXT: vl8re64.v v8, (a2)
; CHECK-NEXT: addi a2, sp, 16
; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
index 96fbe3f6ff025..a78fea1ef3110 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
@@ -799,7 +799,7 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64(<vscale x 16 x double> %va, <vs
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -821,7 +821,7 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64_unmasked(<vscale x 16 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
index 458795db7965d..c759f2b48f53f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
@@ -93,7 +93,7 @@ define <vscale x 32 x float> @vfpext_nxv32f16_nxv32f32(<vscale x 32 x half> %a,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
index 7127d10e67dbc..5a0e0e8004af8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
@@ -469,7 +469,7 @@ define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -497,7 +497,7 @@ define <vscale x 32 x i32> @vfptosi_nxv32i32_nxv32f32(<vscale x 32 x float> %va,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -520,7 +520,7 @@ define <vscale x 32 x i32> @vfptosi_nxv32i32_nxv32f32_unmasked(<vscale x 32 x fl
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
index 07b58ed057508..03c5f7eed3fc0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
@@ -469,7 +469,7 @@ define <vscale x 32 x i16> @vfptoui_nxv32i16_nxv32f32(<vscale x 32 x float> %va,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -497,7 +497,7 @@ define <vscale x 32 x i32> @vfptoui_nxv32i32_nxv32f32(<vscale x 32 x float> %va,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -520,7 +520,7 @@ define <vscale x 32 x i32> @vfptoui_nxv32i32_nxv32f32_unmasked(<vscale x 32 x fl
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
index 4177672b3a306..0f78e035e39d0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
@@ -92,7 +92,7 @@ define <vscale x 16 x float> @vfptrunc_nxv16f32_nxv16f64(<vscale x 16 x double>
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -135,11 +135,11 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
; CHECK-NEXT: slli a3, a1, 1
; CHECK-NEXT: add a6, a0, a4
; CHECK-NEXT: sub a0, a2, a3
-; CHECK-NEXT: sltu a4, a2, a0
+; CHECK-NEXT: sltu a4, a3, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a0, a4, a0
; CHECK-NEXT: sub a4, a0, a1
-; CHECK-NEXT: sltu a7, a0, a4
+; CHECK-NEXT: sltu a7, a1, a0
; CHECK-NEXT: addi a7, a7, -1
; CHECK-NEXT: and a4, a7, a4
; CHECK-NEXT: srli a7, a1, 2
@@ -162,7 +162,7 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
; CHECK-NEXT: mv a2, a3
; CHECK-NEXT: .LBB8_4:
; CHECK-NEXT: sub a0, a2, a1
-; CHECK-NEXT: sltu a3, a2, a0
+; CHECK-NEXT: sltu a3, a1, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a0, a3, a0
; CHECK-NEXT: vmv1r.v v0, v6
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
index 451b13edb794e..a77b8a6905f71 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
@@ -161,7 +161,7 @@ define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: and a3, a4, a3
@@ -196,7 +196,7 @@ define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16_unmasked(<vscale x 32 x bfloa
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: sltu a4, a0, a3
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v16, a2
@@ -437,7 +437,7 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: sltu a4, a0, a3
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
; ZVFHMIN-NEXT: and a3, a4, a3
@@ -478,7 +478,7 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: sltu a4, a0, a3
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: addi a4, a4, -1
; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v16, a2
@@ -715,7 +715,7 @@ define <vscale x 16 x double> @vfsqrt_vv_nxv16f64(<vscale x 16 x double> %va, <v
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -737,7 +737,7 @@ define <vscale x 16 x double> @vfsqrt_vv_nxv16f64_unmasked(<vscale x 16 x double
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
index 6637aced3cdac..ce30d9257cb02 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
@@ -367,13 +367,13 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
+; CHECK-NEXT: sltu a4, a1, a0
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -420,14 +420,14 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT: sltu a4, a1, a0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -459,67 +459,6 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
}
define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vfsub_vf_nxv32bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v7, v0
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vmv.v.x v24, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: csrr a3, vlenb
-; CHECK-NEXT: slli a3, a3, 3
-; CHECK-NEXT: add a3, sp, a3
-; CHECK-NEXT: addi a3, a3, 16
-; CHECK-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsub.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB22_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB22_2:
-; CHECK-NEXT: vmv1r.v v0, v7
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -527,56 +466,6 @@ define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
}
define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, bfloat %b, i32 zeroext %evl) {
-; CHECK-LABEL: vfsub_vf_nxv32bf16_unmasked:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: fmv.x.h a1, fa0
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmset.m v24
-; CHECK-NEXT: vmv.v.x v16, a1
-; CHECK-NEXT: slli a1, a2, 1
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v24, a2
-; CHECK-NEXT: sltu a2, a0, a3
-; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsub.vv v16, v16, v24, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16, v0.t
-; CHECK-NEXT: bltu a0, a1, .LBB23_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB23_2:
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsub.vv v16, v16, v24
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
-; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%vb = shufflevector <vscale x 32 x bfloat> %elt.head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x bfloat> @llvm.vp.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
@@ -1064,13 +953,13 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a1, a0
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1123,14 +1012,14 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFHMIN-NEXT: slli a1, a2, 1
; ZVFHMIN-NEXT: srli a2, a2, 2
; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: sltu a4, a1, a0
+; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
@@ -1167,68 +1056,6 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfsub.vf v8, v8, fa0, v0.t
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfsub_vf_nxv32f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 4
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv1r.v v7, v0
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vmv.v.x v24, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: csrr a3, vlenb
-; ZVFHMIN-NEXT: slli a3, a3, 3
-; ZVFHMIN-NEXT: add a3, sp, a3
-; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsub.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB46_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB46_2:
-; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add a0, sp, a0
-; ZVFHMIN-NEXT: addi a0, a0, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16, v0.t
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 4
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fsub.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
@@ -1241,57 +1068,6 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; ZVFH-NEXT: vfsub.vf v8, v8, fa0
; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vfsub_vf_nxv32f16_unmasked:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a1, vlenb
-; ZVFHMIN-NEXT: slli a1, a1, 3
-; ZVFHMIN-NEXT: sub sp, sp, a1
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT: fmv.x.h a1, fa0
-; ZVFHMIN-NEXT: csrr a2, vlenb
-; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmset.m v24
-; ZVFHMIN-NEXT: vmv.v.x v16, a1
-; ZVFHMIN-NEXT: slli a1, a2, 1
-; ZVFHMIN-NEXT: srli a2, a2, 2
-; ZVFHMIN-NEXT: sub a3, a0, a1
-; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT: sltu a2, a0, a3
-; ZVFHMIN-NEXT: addi a2, a2, -1
-; ZVFHMIN-NEXT: and a2, a2, a3
-; ZVFHMIN-NEXT: addi a3, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: bltu a0, a1, .LBB47_2
-; ZVFHMIN-NEXT: # %bb.1:
-; ZVFHMIN-NEXT: mv a0, a1
-; ZVFHMIN-NEXT: .LBB47_2:
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsub.vv v16, v16, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
-; ZVFHMIN-NEXT: ret
%elt.head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 32 x half> %elt.head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
%v = call <vscale x 32 x half> @llvm.vp.fsub.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
index 3cf464247250a..df4b731015243 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
@@ -396,7 +396,7 @@ define <vscale x 128 x i8> @vmax_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vs
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -421,7 +421,7 @@ define <vscale x 128 x i8> @vmax_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va, i
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -935,7 +935,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <v
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -960,7 +960,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va, i
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -987,11 +987,11 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
; RV32-NEXT: srli a2, a1, 2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vmax.vx v8, v8, a0, v0.t
-; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: slli a3, a1, 1
+; RV32-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
-; RV32-NEXT: slli a2, a1, 1
-; RV32-NEXT: sub a2, a1, a2
-; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: sub a2, a1, a3
+; RV32-NEXT: sltu a1, a3, a1
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -1007,7 +1007,7 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
; RV64-NEXT: slli a2, a1, 1
; RV64-NEXT: vslidedown.vx v0, v0, a3
; RV64-NEXT: sub a3, a1, a2
-; RV64-NEXT: sltu a4, a1, a3
+; RV64-NEXT: sltu a4, a2, a1
; RV64-NEXT: addi a4, a4, -1
; RV64-NEXT: and a3, a4, a3
; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
index e755d099df4a8..9b5e83f94e5fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
@@ -395,7 +395,7 @@ define <vscale x 128 x i8> @vmaxu_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <v
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -420,7 +420,7 @@ define <vscale x 128 x i8> @vmaxu_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va,
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -934,7 +934,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -959,7 +959,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va,
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -986,11 +986,11 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
; RV32-NEXT: srli a2, a1, 2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vmaxu.vx v8, v8, a0, v0.t
-; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: slli a3, a1, 1
+; RV32-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
-; RV32-NEXT: slli a2, a1, 1
-; RV32-NEXT: sub a2, a1, a2
-; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: sub a2, a1, a3
+; RV32-NEXT: sltu a1, a3, a1
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -1006,7 +1006,7 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
; RV64-NEXT: slli a2, a1, 1
; RV64-NEXT: vslidedown.vx v0, v0, a3
; RV64-NEXT: sub a3, a1, a2
-; RV64-NEXT: sltu a4, a1, a3
+; RV64-NEXT: sltu a4, a2, a1
; RV64-NEXT: addi a4, a4, -1
; RV64-NEXT: and a3, a4, a3
; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
index 961f63cbfbc95..1816b07c49c6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
@@ -396,7 +396,7 @@ define <vscale x 128 x i8> @vmin_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <vs
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -421,7 +421,7 @@ define <vscale x 128 x i8> @vmin_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va, i
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -935,7 +935,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <v
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -960,7 +960,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va, i
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -987,11 +987,11 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
; RV32-NEXT: srli a2, a1, 2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vmin.vx v8, v8, a0, v0.t
-; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: slli a3, a1, 1
+; RV32-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
-; RV32-NEXT: slli a2, a1, 1
-; RV32-NEXT: sub a2, a1, a2
-; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: sub a2, a1, a3
+; RV32-NEXT: sltu a1, a3, a1
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -1007,7 +1007,7 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
; RV64-NEXT: slli a2, a1, 1
; RV64-NEXT: vslidedown.vx v0, v0, a3
; RV64-NEXT: sub a3, a1, a2
-; RV64-NEXT: sltu a4, a1, a3
+; RV64-NEXT: sltu a4, a2, a1
; RV64-NEXT: addi a4, a4, -1
; RV64-NEXT: and a3, a4, a3
; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
index 631799d24e14c..608790009bdb5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
@@ -395,7 +395,7 @@ define <vscale x 128 x i8> @vminu_vx_nxv128i8(<vscale x 128 x i8> %va, i8 %b, <v
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -420,7 +420,7 @@ define <vscale x 128 x i8> @vminu_vx_nxv128i8_unmasked(<vscale x 128 x i8> %va,
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
@@ -934,7 +934,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -959,7 +959,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_unmasked(<vscale x 32 x i32> %va,
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
@@ -986,11 +986,11 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
; RV32-NEXT: srli a2, a1, 2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vminu.vx v8, v8, a0, v0.t
-; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: slli a3, a1, 1
+; RV32-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
-; RV32-NEXT: slli a2, a1, 1
-; RV32-NEXT: sub a2, a1, a2
-; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: sub a2, a1, a3
+; RV32-NEXT: sltu a1, a3, a1
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -1006,7 +1006,7 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
; RV64-NEXT: slli a2, a1, 1
; RV64-NEXT: vslidedown.vx v0, v0, a3
; RV64-NEXT: sub a3, a1, a2
-; RV64-NEXT: sltu a4, a1, a3
+; RV64-NEXT: sltu a4, a2, a1
; RV64-NEXT: addi a4, a4, -1
; RV64-NEXT: and a3, a4, a3
; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
index c96a7d774a5d5..65d37bfb31916 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
@@ -700,17 +700,17 @@ define <vscale x 128 x i8> @test_vp_reverse_nxv128i8(<vscale x 128 x i8> %src, i
; CHECK-NEXT: addi a3, sp, 64
; CHECK-NEXT: li a4, -1
; CHECK-NEXT: sub a5, a0, a2
-; CHECK-NEXT: add a6, a0, a3
-; CHECK-NEXT: sltu a0, a0, a5
-; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: sltu a6, a2, a0
+; CHECK-NEXT: add a0, a0, a3
; CHECK-NEXT: addi a6, a6, -1
+; CHECK-NEXT: add a2, a3, a2
; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a5, a6, a5
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vsse8.v v8, (a6), a4
-; CHECK-NEXT: sub a6, a6, a1
-; CHECK-NEXT: and a0, a0, a5
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vsse8.v v16, (a6), a4
+; CHECK-NEXT: vsse8.v v8, (a0), a4
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: vsetvli zero, a5, e8, m8, ta, ma
+; CHECK-NEXT: vsse8.v v16, (a0), a4
; CHECK-NEXT: vle8.v v16, (a2)
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a3)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
index b8b2ba7c5e5d3..aeee1fa8215f0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
@@ -727,7 +727,7 @@ define <vscale x 32 x i32> @vp_splat_nxv32i32(i32 %val, <vscale x 32 x i1> %m, i
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 1
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
index b83ddce61f44d..3d025a29e6725 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
@@ -470,61 +470,61 @@ define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vs
; CHECK-LABEL: test_vp_splice_nxv16i64:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a5, a4, 1
+; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: slli a1, a4, 3
-; CHECK-NEXT: slli a7, a4, 1
-; CHECK-NEXT: addi a7, a7, -1
-; CHECK-NEXT: add a5, a0, a1
-; CHECK-NEXT: mv a6, a2
-; CHECK-NEXT: bltu a2, a7, .LBB22_2
+; CHECK-NEXT: mv a7, a2
+; CHECK-NEXT: bltu a2, a5, .LBB22_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a6, a7
+; CHECK-NEXT: mv a7, a5
; CHECK-NEXT: .LBB22_2:
; CHECK-NEXT: addi sp, sp, -80
; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: csrr a7, vlenb
-; CHECK-NEXT: slli a7, a7, 5
-; CHECK-NEXT: sub sp, sp, a7
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: slli a5, a5, 5
+; CHECK-NEXT: sub sp, sp, a5
; CHECK-NEXT: andi sp, sp, -64
-; CHECK-NEXT: vl8re64.v v24, (a5)
-; CHECK-NEXT: slli a5, a6, 3
+; CHECK-NEXT: add a5, a0, a1
+; CHECK-NEXT: slli a7, a7, 3
; CHECK-NEXT: addi a6, sp, 64
-; CHECK-NEXT: add a5, a6, a5
-; CHECK-NEXT: mv a7, a2
+; CHECK-NEXT: mv t0, a2
; CHECK-NEXT: bltu a2, a4, .LBB22_4
; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: mv a7, a4
+; CHECK-NEXT: mv t0, a4
; CHECK-NEXT: .LBB22_4:
+; CHECK-NEXT: vl8re64.v v24, (a5)
+; CHECK-NEXT: add a5, a6, a7
; CHECK-NEXT: vl8re64.v v0, (a0)
-; CHECK-NEXT: vsetvli zero, a7, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a6)
; CHECK-NEXT: sub a0, a2, a4
-; CHECK-NEXT: add a6, a6, a1
-; CHECK-NEXT: sub a7, a3, a4
-; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: sltu a2, a4, a2
; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a0
-; CHECK-NEXT: sltu a0, a3, a7
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, a7
-; CHECK-NEXT: add a7, a5, a1
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v16, (a6)
+; CHECK-NEXT: and a0, a2, a0
+; CHECK-NEXT: add a6, a6, a1
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v24, (a7)
+; CHECK-NEXT: vse64.v v16, (a6)
+; CHECK-NEXT: mv a0, a3
; CHECK-NEXT: bltu a3, a4, .LBB22_6
; CHECK-NEXT: # %bb.5:
-; CHECK-NEXT: mv a3, a4
+; CHECK-NEXT: mv a0, a4
; CHECK-NEXT: .LBB22_6:
-; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v0, (a5)
-; CHECK-NEXT: addi a2, sp, 104
-; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v0, (a5)
+; CHECK-NEXT: sub a2, a3, a4
+; CHECK-NEXT: sltu a3, a4, a3
+; CHECK-NEXT: add a5, a5, a1
+; CHECK-NEXT: addi a4, sp, 104
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: add a1, a4, a1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v24, (a5)
; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a2)
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v8, (a4)
; CHECK-NEXT: addi sp, s0, -80
; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
@@ -537,66 +537,66 @@ define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vs
define <vscale x 16 x i64> @test_vp_splice_nxv16i64_negative_offset(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) #0 {
; CHECK-LABEL: test_vp_splice_nxv16i64_negative_offset:
; CHECK: # %bb.0:
-; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: slli a1, a4, 3
-; CHECK-NEXT: slli a7, a4, 1
-; CHECK-NEXT: addi a7, a7, -1
-; CHECK-NEXT: add a5, a0, a1
-; CHECK-NEXT: mv a6, a2
-; CHECK-NEXT: bltu a2, a7, .LBB23_2
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: slli a6, a5, 1
+; CHECK-NEXT: addi a6, a6, -1
+; CHECK-NEXT: slli a1, a5, 3
+; CHECK-NEXT: mv a4, a2
+; CHECK-NEXT: bltu a2, a6, .LBB23_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a6, a7
+; CHECK-NEXT: mv a4, a6
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: addi sp, sp, -80
; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: csrr a7, vlenb
-; CHECK-NEXT: slli a7, a7, 5
-; CHECK-NEXT: sub sp, sp, a7
+; CHECK-NEXT: csrr a6, vlenb
+; CHECK-NEXT: slli a6, a6, 5
+; CHECK-NEXT: sub sp, sp, a6
; CHECK-NEXT: andi sp, sp, -64
-; CHECK-NEXT: vl8re64.v v24, (a5)
-; CHECK-NEXT: slli a5, a6, 3
+; CHECK-NEXT: add a6, a0, a1
+; CHECK-NEXT: slli a4, a4, 3
; CHECK-NEXT: addi a7, sp, 64
-; CHECK-NEXT: add a6, a7, a5
; CHECK-NEXT: mv t0, a2
-; CHECK-NEXT: bltu a2, a4, .LBB23_4
+; CHECK-NEXT: bltu a2, a5, .LBB23_4
; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: mv t0, a4
+; CHECK-NEXT: mv t0, a5
; CHECK-NEXT: .LBB23_4:
+; CHECK-NEXT: vl8re64.v v24, (a6)
+; CHECK-NEXT: add a6, a7, a4
; CHECK-NEXT: vl8re64.v v0, (a0)
; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a7)
-; CHECK-NEXT: sub a0, a2, a4
-; CHECK-NEXT: add a7, a7, a1
-; CHECK-NEXT: sub t0, a3, a4
-; CHECK-NEXT: sltu a2, a2, a0
+; CHECK-NEXT: sub a0, a2, a5
+; CHECK-NEXT: sltu a2, a5, a2
; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, a0
-; CHECK-NEXT: sltu a0, a3, t0
-; CHECK-NEXT: addi a0, a0, -1
-; CHECK-NEXT: and a0, a0, t0
-; CHECK-NEXT: add t0, a6, a1
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v16, (a7)
+; CHECK-NEXT: and a0, a2, a0
+; CHECK-NEXT: add a7, a7, a1
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v24, (t0)
-; CHECK-NEXT: bltu a3, a4, .LBB23_6
+; CHECK-NEXT: vse64.v v16, (a7)
+; CHECK-NEXT: mv a0, a3
+; CHECK-NEXT: bltu a3, a5, .LBB23_6
; CHECK-NEXT: # %bb.5:
-; CHECK-NEXT: mv a3, a4
+; CHECK-NEXT: mv a0, a5
; CHECK-NEXT: .LBB23_6:
-; CHECK-NEXT: li a2, 8
-; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v0, (a6)
-; CHECK-NEXT: bltu a5, a2, .LBB23_8
+; CHECK-NEXT: sub a2, a3, a5
+; CHECK-NEXT: sltu a3, a5, a3
+; CHECK-NEXT: add a5, a6, a1
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: li a3, 8
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v24, (a5)
+; CHECK-NEXT: bltu a4, a3, .LBB23_8
; CHECK-NEXT: # %bb.7:
-; CHECK-NEXT: li a5, 8
+; CHECK-NEXT: li a4, 8
; CHECK-NEXT: .LBB23_8:
-; CHECK-NEXT: sub a2, a6, a5
+; CHECK-NEXT: sub a2, a6, a4
; CHECK-NEXT: add a1, a2, a1
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a2)
; CHECK-NEXT: addi sp, s0, -80
; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
index a075bba81d3c6..fb8480ee5f471 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
@@ -254,7 +254,7 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
; RV32-NEXT: slli a2, a3, 1
; RV32-NEXT: srli a3, a3, 2
; RV32-NEXT: sub a4, a1, a2
-; RV32-NEXT: sltu a5, a1, a4
+; RV32-NEXT: sltu a5, a2, a1
; RV32-NEXT: addi a5, a5, -1
; RV32-NEXT: and a4, a5, a4
; RV32-NEXT: vslidedown.vx v0, v0, a3
@@ -281,12 +281,12 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
; RV64-NEXT: slli a3, a2, 1
; RV64-NEXT: srli a4, a2, 2
; RV64-NEXT: sub a5, a1, a3
+; RV64-NEXT: sltu a6, a3, a1
; RV64-NEXT: vslidedown.vx v13, v0, a4
-; RV64-NEXT: sltu a4, a1, a5
-; RV64-NEXT: addi a4, a4, -1
-; RV64-NEXT: and a5, a4, a5
+; RV64-NEXT: addi a6, a6, -1
+; RV64-NEXT: and a5, a6, a5
; RV64-NEXT: sub a4, a5, a2
-; RV64-NEXT: sltu a6, a5, a4
+; RV64-NEXT: sltu a6, a2, a5
; RV64-NEXT: addi a6, a6, -1
; RV64-NEXT: and a6, a6, a4
; RV64-NEXT: srli a4, a2, 3
@@ -310,7 +310,7 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
; RV64-NEXT: mv a1, a3
; RV64-NEXT: .LBB12_4:
; RV64-NEXT: sub a3, a1, a2
-; RV64-NEXT: sltu a5, a1, a3
+; RV64-NEXT: sltu a5, a2, a1
; RV64-NEXT: addi a5, a5, -1
; RV64-NEXT: and a3, a5, a3
; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
@@ -2367,7 +2367,7 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: srli a3, a1, 3
; RV32-NEXT: vslidedown.vx v0, v0, a3
-; RV32-NEXT: sltu a3, a0, a2
+; RV32-NEXT: sltu a3, a1, a0
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2390,7 +2390,7 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
; RV64-NEXT: sub a2, a0, a1
; RV64-NEXT: srli a3, a1, 3
; RV64-NEXT: vslidedown.vx v0, v0, a3
-; RV64-NEXT: sltu a3, a0, a2
+; RV64-NEXT: sltu a3, a1, a0
; RV64-NEXT: addi a3, a3, -1
; RV64-NEXT: and a2, a3, a2
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
@@ -2422,8 +2422,8 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a1, a2
+; RV32-NEXT: sltu a1, a2, a1
; RV32-NEXT: srli a2, a2, 3
-; RV32-NEXT: sltu a1, a1, a3
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
@@ -2443,7 +2443,7 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
; RV64-NEXT: srli a4, a2, 3
; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a4
-; RV64-NEXT: sltu a4, a1, a3
+; RV64-NEXT: sltu a4, a2, a1
; RV64-NEXT: addi a4, a4, -1
; RV64-NEXT: and a3, a4, a3
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
@@ -2479,8 +2479,8 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a1, a2
+; RV32-NEXT: sltu a1, a2, a1
; RV32-NEXT: srli a2, a2, 3
-; RV32-NEXT: sltu a1, a1, a3
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
@@ -2500,7 +2500,7 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
; RV64-NEXT: srli a4, a2, 3
; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a4
-; RV64-NEXT: sltu a4, a1, a3
+; RV64-NEXT: sltu a4, a2, a1
; RV64-NEXT: addi a4, a4, -1
; RV64-NEXT: and a3, a4, a3
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
@@ -2537,8 +2537,8 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a1, a2
+; RV32-NEXT: sltu a1, a2, a1
; RV32-NEXT: srli a2, a2, 3
-; RV32-NEXT: sltu a1, a1, a3
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a2
@@ -2561,8 +2561,8 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV64-NEXT: vluxei32.v v8, (a0), v24, v0.t
; RV64-NEXT: sub a3, a1, a2
+; RV64-NEXT: sltu a1, a2, a1
; RV64-NEXT: srli a2, a2, 3
-; RV64-NEXT: sltu a1, a1, a3
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
index 2ece316c7e54a..4d2ba719d63ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
@@ -456,15 +456,15 @@ define <vscale x 16 x double> @vpload_nxv16f64(ptr %ptr, <vscale x 16 x i1> %m,
; CHECK-NEXT: vmv1r.v v8, v0
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: slli a4, a2, 3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: srli a5, a2, 3
; CHECK-NEXT: vslidedown.vx v0, v0, a5
-; CHECK-NEXT: sltu a5, a1, a3
-; CHECK-NEXT: addi a5, a5, -1
-; CHECK-NEXT: and a3, a5, a3
-; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: slli a5, a2, 3
+; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: and a3, a4, a3
+; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v16, (a4), v0.t
+; CHECK-NEXT: vle64.v v16, (a5), v0.t
; CHECK-NEXT: bltu a1, a2, .LBB44_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a1, a2
@@ -496,18 +496,18 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
; CHECK-NEXT: mv a4, a5
; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: sub a6, a4, a3
-; CHECK-NEXT: slli a7, a3, 3
-; CHECK-NEXT: sltu t0, a4, a6
-; CHECK-NEXT: addi t0, t0, -1
-; CHECK-NEXT: and a6, t0, a6
-; CHECK-NEXT: srli t0, a3, 3
-; CHECK-NEXT: sub t1, a2, a5
-; CHECK-NEXT: add a5, a0, a7
-; CHECK-NEXT: sltu a2, a2, t1
+; CHECK-NEXT: sltu a7, a3, a4
+; CHECK-NEXT: sub t0, a2, a5
+; CHECK-NEXT: sltu a2, a5, a2
+; CHECK-NEXT: slli a5, a3, 3
+; CHECK-NEXT: addi a7, a7, -1
+; CHECK-NEXT: and a6, a7, a6
+; CHECK-NEXT: srli a7, a3, 3
+; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a2, a2, t1
-; CHECK-NEXT: vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v8, t0
+; CHECK-NEXT: and a2, a2, t0
+; CHECK-NEXT: vsetvli t0, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a7
; CHECK-NEXT: bltu a2, a3, .LBB45_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: mv a2, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
index f92ee37051840..01edd0f912bd6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
@@ -198,22 +198,22 @@ define <vscale x 128 x i1> @vpmerge_nxv128i1(<vscale x 128 x i1> %va, <vscale x
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: .LBB7_2:
-; CHECK-NEXT: vsetvli a3, zero, e8, m8, ta, ma
+; CHECK-NEXT: sub a3, a0, a2
+; CHECK-NEXT: sltu a0, a2, a0
+; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma
; CHECK-NEXT: vmv.v.i v16, 0
-; CHECK-NEXT: sub a2, a0, a2
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vmerge.vim v24, v16, 1, v0
; CHECK-NEXT: vmv1r.v v0, v9
-; CHECK-NEXT: vsetvli a3, zero, e8, m8, ta, ma
+; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma
; CHECK-NEXT: vmerge.vim v8, v16, 1, v0
-; CHECK-NEXT: sltu a0, a0, a2
+; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vmv1r.v v0, v4
; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0
-; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a3
; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
; CHECK-NEXT: vmsne.vi v24, v8, 0
-; CHECK-NEXT: and a0, a0, a2
; CHECK-NEXT: vmv1r.v v0, v5
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vmerge.vim v8, v16, 1, v0
@@ -547,7 +547,7 @@ define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
; CHECK-NEXT: add a2, a0, a1
; CHECK-NEXT: sub a4, a3, a1
; CHECK-NEXT: vl8r.v v24, (a2)
-; CHECK-NEXT: sltu a2, a3, a4
+; CHECK-NEXT: sltu a2, a1, a3
; CHECK-NEXT: vl8r.v v8, (a0)
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a4
@@ -583,7 +583,7 @@ define <vscale x 128 x i8> @vpmerge_vx_nxv128i8(i8 %a, <vscale x 128 x i8> %vb,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a3, a2, a1
-; CHECK-NEXT: sltu a4, a2, a3
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vsetvli zero, a3, e8, m8, tu, ma
@@ -611,7 +611,7 @@ define <vscale x 128 x i8> @vpmerge_vi_nxv128i8(<vscale x 128 x i8> %vb, <vscale
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub a2, a1, a0
-; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: sltu a3, a0, a1
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
index 7e4a60095d7cc..153a0a70d098a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
@@ -2193,8 +2193,8 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (zero), v24, v0.t
; RV32-NEXT: sub a2, a1, a0
+; RV32-NEXT: sltu a1, a0, a1
; RV32-NEXT: srli a0, a0, 3
-; RV32-NEXT: sltu a1, a1, a2
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a0
@@ -2226,8 +2226,8 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t
; RV64-NEXT: sub a0, a2, a1
+; RV64-NEXT: sltu a2, a1, a2
; RV64-NEXT: srli a1, a1, 3
-; RV64-NEXT: sltu a2, a2, a0
; RV64-NEXT: addi a2, a2, -1
; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a1
@@ -2263,8 +2263,8 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a2, a1
+; RV32-NEXT: sltu a2, a1, a2
; RV32-NEXT: srli a1, a1, 3
-; RV32-NEXT: sltu a2, a2, a3
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a1
@@ -2298,8 +2298,8 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: sub a3, a2, a1
+; RV64-NEXT: sltu a2, a1, a2
; RV64-NEXT: srli a1, a1, 3
-; RV64-NEXT: sltu a2, a2, a3
; RV64-NEXT: addi a2, a2, -1
; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a1
@@ -2336,8 +2336,8 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a2, a1
+; RV32-NEXT: sltu a2, a1, a2
; RV32-NEXT: srli a1, a1, 3
-; RV32-NEXT: sltu a2, a2, a3
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a1
@@ -2371,8 +2371,8 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: sub a3, a2, a1
+; RV64-NEXT: sltu a2, a1, a2
; RV64-NEXT: srli a1, a1, 3
-; RV64-NEXT: sltu a2, a2, a3
; RV64-NEXT: addi a2, a2, -1
; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a1
@@ -2410,8 +2410,8 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a2, a1
+; RV32-NEXT: sltu a2, a1, a2
; RV32-NEXT: srli a1, a1, 3
-; RV32-NEXT: sltu a2, a2, a3
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a1
@@ -2435,8 +2435,8 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV64-NEXT: vsoxei32.v v8, (a0), v24, v0.t
; RV64-NEXT: sub a3, a2, a1
+; RV64-NEXT: sltu a2, a1, a2
; RV64-NEXT: srli a1, a1, 3
-; RV64-NEXT: sltu a2, a2, a3
; RV64-NEXT: addi a2, a2, -1
; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vx v0, v0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
index 9fd8b9d23cb5e..3468fda9011a0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
@@ -373,8 +373,8 @@ define void @vpstore_nxv16f64(<vscale x 16 x double> %val, ptr %ptr, <vscale x 1
; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: sub a3, a1, a2
+; CHECK-NEXT: sltu a1, a2, a1
; CHECK-NEXT: slli a2, a2, 3
-; CHECK-NEXT: sltu a1, a1, a3
; CHECK-NEXT: addi a1, a1, -1
; CHECK-NEXT: and a1, a1, a3
; CHECK-NEXT: add a0, a0, a2
@@ -409,20 +409,20 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
; CHECK-NEXT: vsetvli zero, a6, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a1), v0.t
; CHECK-NEXT: sub a0, a5, a3
-; CHECK-NEXT: srli a6, a3, 3
+; CHECK-NEXT: sltu a5, a3, a5
+; CHECK-NEXT: sub a6, a2, a4
+; CHECK-NEXT: sltu a2, a4, a2
+; CHECK-NEXT: srli a4, a3, 3
; CHECK-NEXT: vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v7, a6
-; CHECK-NEXT: slli a6, a3, 3
-; CHECK-NEXT: sub a4, a2, a4
-; CHECK-NEXT: sltu a5, a5, a0
-; CHECK-NEXT: add a6, a1, a6
-; CHECK-NEXT: sltu a2, a2, a4
+; CHECK-NEXT: vslidedown.vx v0, v7, a4
+; CHECK-NEXT: slli a4, a3, 3
; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: add a4, a1, a4
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a5, a5, a0
-; CHECK-NEXT: and a0, a2, a4
+; CHECK-NEXT: and a0, a2, a6
; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v16, (a6), v0.t
+; CHECK-NEXT: vse64.v v16, (a4), v0.t
; CHECK-NEXT: bltu a0, a3, .LBB36_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: mv a0, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
index df97f19df7f99..4f31167b80691 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
@@ -91,7 +91,7 @@ define half @vpreduce_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscale x
; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v24, v0, a1
; CHECK-NEXT: sub a1, a0, a2
-; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: sltu a3, a2, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a1, a3, a1
; CHECK-NEXT: bltu a0, a2, .LBB6_2
@@ -120,7 +120,7 @@ define half @vpreduce_ord_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscal
; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v24, v0, a1
; CHECK-NEXT: sub a1, a0, a2
-; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: sltu a3, a2, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a1, a3, a1
; CHECK-NEXT: bltu a0, a2, .LBB7_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
index 7eea35afe0aa0..f2b84c28db92e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
@@ -884,7 +884,7 @@ define signext i32 @vpreduce_umax_nxv32i32(i32 signext %s, <vscale x 32 x i32> %
; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v24, v0, a2
; CHECK-NEXT: sub a2, a1, a3
-; CHECK-NEXT: sltu a4, a1, a2
+; CHECK-NEXT: sltu a4, a3, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a2, a4, a2
; CHECK-NEXT: bltu a1, a3, .LBB67_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
index 1e629e9d20530..535a5bdb839e0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
@@ -318,7 +318,7 @@ define zeroext i1 @vpreduce_or_nxv128i1(i1 zeroext %s, <vscale x 128 x i1> %v, <
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a1, a3
+; CHECK-NEXT: sltu a4, a2, a1
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: vmv1r.v v0, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
index 98634fe55de41..b4ed1857652f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
@@ -557,7 +557,7 @@ define <vscale x 128 x i8> @vsadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub a2, a1, a0
-; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: sltu a3, a0, a1
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
@@ -580,7 +580,7 @@ define <vscale x 128 x i8> @vsadd_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
@@ -1312,7 +1312,7 @@ define <vscale x 32 x i32> @vsadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -1335,7 +1335,7 @@ define <vscale x 32 x i32> @vsadd_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
index a7d304261f87f..d761b8da7929c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
@@ -556,7 +556,7 @@ define <vscale x 128 x i8> @vsaddu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub a2, a1, a0
-; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: sltu a3, a0, a1
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
@@ -579,7 +579,7 @@ define <vscale x 128 x i8> @vsaddu_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
@@ -1311,7 +1311,7 @@ define <vscale x 32 x i32> @vsaddu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -1334,7 +1334,7 @@ define <vscale x 32 x i32> @vsaddu_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index d1933560f2698..e6ef1bcf73a3d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -308,12 +308,12 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
; CHECK-NEXT: csrr a3, vlenb
; CHECK-NEXT: slli a4, a3, 3
; CHECK-NEXT: slli a1, a3, 1
-; CHECK-NEXT: srli a3, a3, 2
; CHECK-NEXT: add a4, a0, a4
; CHECK-NEXT: sub a5, a2, a1
; CHECK-NEXT: vl8re32.v v24, (a4)
-; CHECK-NEXT: sltu a4, a2, a5
+; CHECK-NEXT: sltu a4, a1, a2
; CHECK-NEXT: addi a4, a4, -1
+; CHECK-NEXT: srli a3, a3, 2
; CHECK-NEXT: vl8re32.v v8, (a0)
; CHECK-NEXT: vslidedown.vx v0, v0, a3
; CHECK-NEXT: and a4, a4, a5
@@ -349,14 +349,14 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
; RV32-NEXT: slli a2, a1, 3
; RV32-NEXT: add a0, a0, a2
; RV32-NEXT: slli a2, a1, 1
-; RV32-NEXT: sub a2, a1, a2
; RV32-NEXT: vl8re32.v v24, (a0)
-; RV32-NEXT: sltu a0, a1, a2
-; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: sub a0, a1, a2
+; RV32-NEXT: sltu a2, a2, a1
+; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: srli a1, a1, 2
; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a1
-; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: and a0, a2, a0
; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; RV32-NEXT: vmerge.vvm v16, v24, v16, v0
; RV32-NEXT: ret
@@ -376,16 +376,16 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a3, a1, 3
; RV64-NEXT: slli a2, a1, 1
-; RV64-NEXT: srli a4, a1, 2
; RV64-NEXT: add a3, a0, a3
-; RV64-NEXT: sub a5, a1, a2
+; RV64-NEXT: sub a4, a1, a2
+; RV64-NEXT: sltu a5, a2, a1
; RV64-NEXT: vl8re32.v v24, (a3)
-; RV64-NEXT: sltu a3, a1, a5
-; RV64-NEXT: addi a3, a3, -1
+; RV64-NEXT: addi a5, a5, -1
+; RV64-NEXT: srli a3, a1, 2
; RV64-NEXT: vl8re32.v v8, (a0)
-; RV64-NEXT: vslidedown.vx v0, v0, a4
-; RV64-NEXT: and a3, a3, a5
-; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV64-NEXT: vslidedown.vx v0, v0, a3
+; RV64-NEXT: and a4, a5, a4
+; RV64-NEXT: vsetvli zero, a4, e32, m8, ta, ma
; RV64-NEXT: vmerge.vvm v16, v24, v16, v0
; RV64-NEXT: bltu a1, a2, .LBB28_2
; RV64-NEXT: # %bb.1:
@@ -637,10 +637,10 @@ define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a3, a1, 3
; CHECK-NEXT: sub a4, a2, a1
+; CHECK-NEXT: sltu a5, a1, a2
; CHECK-NEXT: add a3, a0, a3
-; CHECK-NEXT: sltu a5, a2, a4
-; CHECK-NEXT: vl8re64.v v24, (a3)
; CHECK-NEXT: addi a5, a5, -1
+; CHECK-NEXT: vl8re64.v v24, (a3)
; CHECK-NEXT: srli a3, a1, 3
; CHECK-NEXT: vl8re64.v v8, (a0)
; CHECK-NEXT: vslidedown.vx v0, v0, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
index 07411b1c7ae08..c8bb009d2c3b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
@@ -144,7 +144,7 @@ define <vscale x 32 x i32> @vsext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscal
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -168,7 +168,7 @@ define <vscale x 32 x i32> @vsext_nxv32i8_nxv32i32_unmasked(<vscale x 32 x i8> %
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
index 7f96da141c363..90f1ca0843b02 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
@@ -478,7 +478,7 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
; ZVFH-NEXT: slli a1, a1, 1
; ZVFH-NEXT: vslidedown.vx v0, v0, a2
; ZVFH-NEXT: sub a2, a0, a1
-; ZVFH-NEXT: sltu a3, a0, a2
+; ZVFH-NEXT: sltu a3, a1, a0
; ZVFH-NEXT: addi a3, a3, -1
; ZVFH-NEXT: and a2, a3, a2
; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -502,7 +502,7 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
; ZVFHMIN-NEXT: slli a1, a1, 1
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: sltu a3, a0, a2
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a2, a3, a2
; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -534,7 +534,7 @@ define <vscale x 32 x float> @vsitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -557,7 +557,7 @@ define <vscale x 32 x float> @vsitofp_nxv32f32_nxv32i32_unmasked(<vscale x 32 x
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
index 0ac2ef7e251c0..a6a631be9dab4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
@@ -574,7 +574,7 @@ define <vscale x 128 x i8> @vssub_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub a2, a1, a0
-; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: sltu a3, a0, a1
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
@@ -598,7 +598,7 @@ define <vscale x 128 x i8> @vssub_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
@@ -1353,7 +1353,7 @@ define <vscale x 32 x i32> @vssub_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
@@ -1377,7 +1377,7 @@ define <vscale x 32 x i32> @vssub_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
index bde279a4d1f2b..1992b97e0de0d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
@@ -572,7 +572,7 @@ define <vscale x 128 x i8> @vssubu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub a2, a1, a0
-; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: sltu a3, a0, a1
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
@@ -596,7 +596,7 @@ define <vscale x 128 x i8> @vssubu_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
@@ -1351,7 +1351,7 @@ define <vscale x 32 x i32> @vssubu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
@@ -1375,7 +1375,7 @@ define <vscale x 32 x i32> @vssubu_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va,
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a3, a3, a2
; CHECK-NEXT: li a2, -1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
index 0c1ca369521f7..0b07b60da8250 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
@@ -147,7 +147,7 @@ define <vscale x 15 x i16> @vtrunc_nxv15i16_nxv15i64(<vscale x 15 x i64> %a, <vs
; CHECK-NEXT: srli a2, a1, 3
; CHECK-NEXT: sub a3, a0, a1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
-; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: sltu a2, a1, a0
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
@@ -201,7 +201,7 @@ define <vscale x 32 x i7> @vtrunc_nxv32i7_nxv32i32(<vscale x 32 x i32> %a, <vsca
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -233,7 +233,7 @@ define <vscale x 32 x i8> @vtrunc_nxv32i8_nxv32i32(<vscale x 32 x i32> %a, <vsca
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -280,11 +280,11 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
; CHECK-NEXT: slli a3, a1, 1
; CHECK-NEXT: add a6, a0, a4
; CHECK-NEXT: sub a0, a2, a3
-; CHECK-NEXT: sltu a4, a2, a0
+; CHECK-NEXT: sltu a4, a3, a2
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a0, a4, a0
; CHECK-NEXT: sub a4, a0, a1
-; CHECK-NEXT: sltu a7, a0, a4
+; CHECK-NEXT: sltu a7, a1, a0
; CHECK-NEXT: addi a7, a7, -1
; CHECK-NEXT: and a4, a7, a4
; CHECK-NEXT: srli a7, a1, 2
@@ -307,7 +307,7 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
; CHECK-NEXT: mv a2, a3
; CHECK-NEXT: .LBB17_4:
; CHECK-NEXT: sub a0, a2, a1
-; CHECK-NEXT: sltu a3, a2, a0
+; CHECK-NEXT: sltu a3, a1, a2
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a0, a3, a0
; CHECK-NEXT: vmv1r.v v0, v6
diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
index c0c749ebf3186..807c2d9fa3ce6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
@@ -478,7 +478,7 @@ define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
; ZVFH-NEXT: slli a1, a1, 1
; ZVFH-NEXT: vslidedown.vx v0, v0, a2
; ZVFH-NEXT: sub a2, a0, a1
-; ZVFH-NEXT: sltu a3, a0, a2
+; ZVFH-NEXT: sltu a3, a1, a0
; ZVFH-NEXT: addi a3, a3, -1
; ZVFH-NEXT: and a2, a3, a2
; ZVFH-NEXT: vsetvli zero, a2, e16, m4, ta, ma
@@ -502,7 +502,7 @@ define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
; ZVFHMIN-NEXT: slli a1, a1, 1
; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
; ZVFHMIN-NEXT: sub a2, a0, a1
-; ZVFHMIN-NEXT: sltu a3, a0, a2
+; ZVFHMIN-NEXT: sltu a3, a1, a0
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a2, a3, a2
; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -534,7 +534,7 @@ define <vscale x 32 x float> @vuitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va,
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -557,7 +557,7 @@ define <vscale x 32 x float> @vuitofp_nxv32f32_nxv32i32_unmasked(<vscale x 32 x
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
index 9713b617b8384..44a1084b4a208 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
@@ -144,7 +144,7 @@ define <vscale x 32 x i32> @vzext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscal
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: vslidedown.vx v0, v0, a2
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
@@ -168,7 +168,7 @@ define <vscale x 32 x i32> @vzext_nxv32i8_nxv32i32_unmasked(<vscale x 32 x i8> %
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub a2, a0, a1
-; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: sltu a3, a1, a0
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a2, a3, a2
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/usub_sat.ll b/llvm/test/CodeGen/RISCV/usub_sat.ll
index 33056682dcc79..6fcc6bc5f3dcd 100644
--- a/llvm/test/CodeGen/RISCV/usub_sat.ll
+++ b/llvm/test/CodeGen/RISCV/usub_sat.ll
@@ -7,10 +7,10 @@
define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
; RV32I-LABEL: func:
; RV32I: # %bb.0:
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func:
@@ -57,10 +57,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: func2:
; RV64I: # %bb.0:
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func2:
@@ -93,18 +93,18 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y) nounwind {
; RV32I-LABEL: func16:
; RV32I: # %bb.0:
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func16:
; RV64I: # %bb.0:
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func16:
@@ -125,18 +125,18 @@ define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y) nounwind {
define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y) nounwind {
; RV32I-LABEL: func8:
; RV32I: # %bb.0:
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func8:
; RV64I: # %bb.0:
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func8:
@@ -157,18 +157,18 @@ define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y) nounwind {
define zeroext i4 @func3(i4 zeroext %x, i4 zeroext %y) nounwind {
; RV32I-LABEL: func3:
; RV32I: # %bb.0:
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func3:
; RV64I: # %bb.0:
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func3:
diff --git a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
index ef6bc022ddc9f..838f2dbe2276d 100644
--- a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
@@ -8,10 +8,10 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
; RV32I-LABEL: func32:
; RV32I: # %bb.0:
; RV32I-NEXT: mul a1, a1, a2
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func32:
@@ -65,7 +65,7 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
; RV64I-LABEL: func64:
; RV64I: # %bb.0:
; RV64I-NEXT: sub a1, a0, a2
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sltu a0, a2, a0
; RV64I-NEXT: addi a0, a0, -1
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
@@ -106,10 +106,10 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
; RV32I-NEXT: addi a3, a3, -1
; RV32I-NEXT: and a0, a0, a3
; RV32I-NEXT: and a1, a1, a3
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func16:
@@ -119,10 +119,10 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
; RV64I-NEXT: addi a3, a3, -1
; RV64I-NEXT: and a0, a0, a3
; RV64I-NEXT: and a1, a1, a3
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func16:
@@ -153,10 +153,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
; RV32I-NEXT: zext.b a0, a0
; RV32I-NEXT: mul a1, a1, a2
; RV32I-NEXT: zext.b a1, a1
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func8:
@@ -164,10 +164,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
; RV64I-NEXT: zext.b a0, a0
; RV64I-NEXT: mul a1, a1, a2
; RV64I-NEXT: zext.b a1, a1
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func8:
@@ -198,10 +198,10 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
; RV32I-NEXT: andi a0, a0, 15
; RV32I-NEXT: mul a1, a1, a2
; RV32I-NEXT: andi a1, a1, 15
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: func4:
@@ -209,10 +209,10 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
; RV64I-NEXT: andi a0, a0, 15
; RV64I-NEXT: mul a1, a1, a2
; RV64I-NEXT: andi a1, a1, 15
-; RV64I-NEXT: sub a1, a0, a1
-; RV64I-NEXT: sltu a0, a0, a1
+; RV64I-NEXT: sub a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: addi a0, a0, -1
-; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func4:
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index 62f08d7831dda..0de2cbd76b749 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -937,9 +937,10 @@ entry:
define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
; RV32-LABEL: usubo.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: sub a1, a0, a1
-; RV32-NEXT: sltu a0, a0, a1
-; RV32-NEXT: sw a1, 0(a2)
+; RV32-NEXT: sltu a3, a1, a0
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: sw a0, 0(a2)
+; RV32-NEXT: mv a0, a3
; RV32-NEXT: ret
;
; RV64-LABEL: usubo.i32:
@@ -951,9 +952,10 @@ define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
;
; RV32ZBA-LABEL: usubo.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: sub a1, a0, a1
-; RV32ZBA-NEXT: sltu a0, a0, a1
-; RV32ZBA-NEXT: sw a1, 0(a2)
+; RV32ZBA-NEXT: sltu a3, a1, a0
+; RV32ZBA-NEXT: sub a0, a0, a1
+; RV32ZBA-NEXT: sw a0, 0(a2)
+; RV32ZBA-NEXT: mv a0, a3
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: usubo.i32:
@@ -965,9 +967,10 @@ define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
;
; RV32ZICOND-LABEL: usubo.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: sub a1, a0, a1
-; RV32ZICOND-NEXT: sltu a0, a0, a1
-; RV32ZICOND-NEXT: sw a1, 0(a2)
+; RV32ZICOND-NEXT: sltu a3, a1, a0
+; RV32ZICOND-NEXT: sub a0, a0, a1
+; RV32ZICOND-NEXT: sw a0, 0(a2)
+; RV32ZICOND-NEXT: mv a0, a3
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: usubo.i32:
@@ -987,9 +990,11 @@ entry:
define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
; RV32-LABEL: usubo.i32.constant.rhs:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: addi a2, a0, 2
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: sw a2, 0(a1)
+; RV32-NEXT: addi a2, a0, 1
+; RV32-NEXT: seqz a2, a2
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: sw a0, 0(a1)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: usubo.i32.constant.rhs:
@@ -1001,9 +1006,11 @@ define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
;
; RV32ZBA-LABEL: usubo.i32.constant.rhs:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: addi a2, a0, 2
-; RV32ZBA-NEXT: sltu a0, a0, a2
-; RV32ZBA-NEXT: sw a2, 0(a1)
+; RV32ZBA-NEXT: addi a2, a0, 1
+; RV32ZBA-NEXT: seqz a2, a2
+; RV32ZBA-NEXT: addi a0, a0, 2
+; RV32ZBA-NEXT: sw a0, 0(a1)
+; RV32ZBA-NEXT: mv a0, a2
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: usubo.i32.constant.rhs:
@@ -1015,9 +1022,11 @@ define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
;
; RV32ZICOND-LABEL: usubo.i32.constant.rhs:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: addi a2, a0, 2
-; RV32ZICOND-NEXT: sltu a0, a0, a2
-; RV32ZICOND-NEXT: sw a2, 0(a1)
+; RV32ZICOND-NEXT: addi a2, a0, 1
+; RV32ZICOND-NEXT: seqz a2, a2
+; RV32ZICOND-NEXT: addi a0, a0, 2
+; RV32ZICOND-NEXT: sw a0, 0(a1)
+; RV32ZICOND-NEXT: mv a0, a2
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: usubo.i32.constant.rhs:
@@ -1039,8 +1048,7 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
; RV32: # %bb.0: # %entry
; RV32-NEXT: li a2, -2
; RV32-NEXT: sub a2, a2, a0
-; RV32-NEXT: addi a0, a2, 1
-; RV32-NEXT: seqz a0, a0
+; RV32-NEXT: sltiu a0, a0, -2
; RV32-NEXT: sw a2, 0(a1)
; RV32-NEXT: ret
;
@@ -1057,8 +1065,7 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: li a2, -2
; RV32ZBA-NEXT: sub a2, a2, a0
-; RV32ZBA-NEXT: addi a0, a2, 1
-; RV32ZBA-NEXT: seqz a0, a0
+; RV32ZBA-NEXT: sltiu a0, a0, -2
; RV32ZBA-NEXT: sw a2, 0(a1)
; RV32ZBA-NEXT: ret
;
@@ -1075,8 +1082,7 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
; RV32ZICOND: # %bb.0: # %entry
; RV32ZICOND-NEXT: li a2, -2
; RV32ZICOND-NEXT: sub a2, a2, a0
-; RV32ZICOND-NEXT: addi a0, a2, 1
-; RV32ZICOND-NEXT: seqz a0, a0
+; RV32ZICOND-NEXT: sltiu a0, a0, -2
; RV32ZICOND-NEXT: sw a2, 0(a1)
; RV32ZICOND-NEXT: ret
;
@@ -1116,9 +1122,10 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV64-LABEL: usubo.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: sd a1, 0(a2)
+; RV64-NEXT: sltu a3, a1, a0
+; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: sd a0, 0(a2)
+; RV64-NEXT: mv a0, a3
; RV64-NEXT: ret
;
; RV32ZBA-LABEL: usubo.i64:
@@ -1140,9 +1147,10 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV64ZBA-LABEL: usubo.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sub a1, a0, a1
-; RV64ZBA-NEXT: sltu a0, a0, a1
-; RV64ZBA-NEXT: sd a1, 0(a2)
+; RV64ZBA-NEXT: sltu a3, a1, a0
+; RV64ZBA-NEXT: sub a0, a0, a1
+; RV64ZBA-NEXT: sd a0, 0(a2)
+; RV64ZBA-NEXT: mv a0, a3
; RV64ZBA-NEXT: ret
;
; RV32ZICOND-LABEL: usubo.i64:
@@ -1163,9 +1171,10 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV64ZICOND-LABEL: usubo.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sub a1, a0, a1
-; RV64ZICOND-NEXT: sltu a0, a0, a1
-; RV64ZICOND-NEXT: sd a1, 0(a2)
+; RV64ZICOND-NEXT: sltu a3, a1, a0
+; RV64ZICOND-NEXT: sub a0, a0, a1
+; RV64ZICOND-NEXT: sd a0, 0(a2)
+; RV64ZICOND-NEXT: mv a0, a3
; RV64ZICOND-NEXT: ret
entry:
%t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
@@ -2810,8 +2819,7 @@ entry:
define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
; RV32-LABEL: usubo.select.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: sub a2, a0, a1
-; RV32-NEXT: bltu a0, a2, .LBB40_2
+; RV32-NEXT: bltu a1, a0, .LBB40_2
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: mv a0, a1
; RV32-NEXT: .LBB40_2: # %entry
@@ -2828,8 +2836,7 @@ define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZBA-LABEL: usubo.select.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: sub a2, a0, a1
-; RV32ZBA-NEXT: bltu a0, a2, .LBB40_2
+; RV32ZBA-NEXT: bltu a1, a0, .LBB40_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: mv a0, a1
; RV32ZBA-NEXT: .LBB40_2: # %entry
@@ -2846,8 +2853,7 @@ define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZICOND-LABEL: usubo.select.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: sub a2, a0, a1
-; RV32ZICOND-NEXT: sltu a2, a0, a2
+; RV32ZICOND-NEXT: sltu a2, a1, a0
; RV32ZICOND-NEXT: czero.nez a1, a1, a2
; RV32ZICOND-NEXT: czero.eqz a0, a0, a2
; RV32ZICOND-NEXT: or a0, a0, a1
@@ -2871,8 +2877,7 @@ entry:
define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
; RV32-LABEL: usubo.not.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: sub a1, a0, a1
-; RV32-NEXT: sltu a0, a0, a1
+; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: xori a0, a0, 1
; RV32-NEXT: ret
;
@@ -2885,8 +2890,7 @@ define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZBA-LABEL: usubo.not.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: sub a1, a0, a1
-; RV32ZBA-NEXT: sltu a0, a0, a1
+; RV32ZBA-NEXT: sltu a0, a1, a0
; RV32ZBA-NEXT: xori a0, a0, 1
; RV32ZBA-NEXT: ret
;
@@ -2899,8 +2903,7 @@ define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZICOND-LABEL: usubo.not.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: sub a1, a0, a1
-; RV32ZICOND-NEXT: sltu a0, a0, a1
+; RV32ZICOND-NEXT: sltu a0, a1, a0
; RV32ZICOND-NEXT: xori a0, a0, 1
; RV32ZICOND-NEXT: ret
;
@@ -2940,8 +2943,7 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
;
; RV64-LABEL: usubo.select.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: sub a2, a0, a1
-; RV64-NEXT: bltu a0, a2, .LBB42_2
+; RV64-NEXT: bltu a1, a0, .LBB42_2
; RV64-NEXT: # %bb.1: # %entry
; RV64-NEXT: mv a0, a1
; RV64-NEXT: .LBB42_2: # %entry
@@ -2969,8 +2971,7 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
;
; RV64ZBA-LABEL: usubo.select.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sub a2, a0, a1
-; RV64ZBA-NEXT: bltu a0, a2, .LBB42_2
+; RV64ZBA-NEXT: bltu a1, a0, .LBB42_2
; RV64ZBA-NEXT: # %bb.1: # %entry
; RV64ZBA-NEXT: mv a0, a1
; RV64ZBA-NEXT: .LBB42_2: # %entry
@@ -2998,8 +2999,7 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
;
; RV64ZICOND-LABEL: usubo.select.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sub a2, a0, a1
-; RV64ZICOND-NEXT: sltu a2, a0, a2
+; RV64ZICOND-NEXT: sltu a2, a1, a0
; RV64ZICOND-NEXT: czero.nez a1, a1, a2
; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
; RV64ZICOND-NEXT: or a0, a0, a1
@@ -3030,8 +3030,7 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
;
; RV64-LABEL: usubo.not.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: sltu a0, a0, a1
+; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: xori a0, a0, 1
; RV64-NEXT: ret
;
@@ -3053,8 +3052,7 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
;
; RV64ZBA-LABEL: usubo.not.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sub a1, a0, a1
-; RV64ZBA-NEXT: sltu a0, a0, a1
+; RV64ZBA-NEXT: sltu a0, a1, a0
; RV64ZBA-NEXT: xori a0, a0, 1
; RV64ZBA-NEXT: ret
;
@@ -3075,8 +3073,7 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
;
; RV64ZICOND-LABEL: usubo.not.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sub a1, a0, a1
-; RV64ZICOND-NEXT: sltu a0, a0, a1
+; RV64ZICOND-NEXT: sltu a0, a1, a0
; RV64ZICOND-NEXT: xori a0, a0, 1
; RV64ZICOND-NEXT: ret
entry:
@@ -4379,8 +4376,7 @@ continue:
define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
; RV32-LABEL: usubo.br.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: sub a1, a0, a1
-; RV32-NEXT: bgeu a0, a1, .LBB58_2
+; RV32-NEXT: bgeu a1, a0, .LBB58_2
; RV32-NEXT: # %bb.1: # %overflow
; RV32-NEXT: li a0, 0
; RV32-NEXT: ret
@@ -4401,8 +4397,7 @@ define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZBA-LABEL: usubo.br.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: sub a1, a0, a1
-; RV32ZBA-NEXT: bgeu a0, a1, .LBB58_2
+; RV32ZBA-NEXT: bgeu a1, a0, .LBB58_2
; RV32ZBA-NEXT: # %bb.1: # %overflow
; RV32ZBA-NEXT: li a0, 0
; RV32ZBA-NEXT: ret
@@ -4423,8 +4418,7 @@ define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZICOND-LABEL: usubo.br.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: sub a1, a0, a1
-; RV32ZICOND-NEXT: bgeu a0, a1, .LBB58_2
+; RV32ZICOND-NEXT: bgeu a1, a0, .LBB58_2
; RV32ZICOND-NEXT: # %bb.1: # %overflow
; RV32ZICOND-NEXT: li a0, 0
; RV32ZICOND-NEXT: ret
@@ -4478,8 +4472,7 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
;
; RV64-LABEL: usubo.br.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: bgeu a0, a1, .LBB59_2
+; RV64-NEXT: bgeu a1, a0, .LBB59_2
; RV64-NEXT: # %bb.1: # %overflow
; RV64-NEXT: li a0, 0
; RV64-NEXT: ret
@@ -4509,8 +4502,7 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
;
; RV64ZBA-LABEL: usubo.br.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sub a1, a0, a1
-; RV64ZBA-NEXT: bgeu a0, a1, .LBB59_2
+; RV64ZBA-NEXT: bgeu a1, a0, .LBB59_2
; RV64ZBA-NEXT: # %bb.1: # %overflow
; RV64ZBA-NEXT: li a0, 0
; RV64ZBA-NEXT: ret
@@ -4540,8 +4532,7 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
;
; RV64ZICOND-LABEL: usubo.br.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sub a1, a0, a1
-; RV64ZICOND-NEXT: bgeu a0, a1, .LBB59_2
+; RV64ZICOND-NEXT: bgeu a1, a0, .LBB59_2
; RV64ZICOND-NEXT: # %bb.1: # %overflow
; RV64ZICOND-NEXT: li a0, 0
; RV64ZICOND-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/xqcia.ll b/llvm/test/CodeGen/RISCV/xqcia.ll
index 3bbf33328f529..6d5fc765c49a8 100644
--- a/llvm/test/CodeGen/RISCV/xqcia.ll
+++ b/llvm/test/CodeGen/RISCV/xqcia.ll
@@ -71,10 +71,10 @@ define i32 @subsat(i32 %a, i32 %b) {
define i32 @subusat(i32 %a, i32 %b) {
; RV32I-LABEL: subusat:
; RV32I: # %bb.0:
-; RV32I-NEXT: sub a1, a0, a1
-; RV32I-NEXT: sltu a0, a0, a1
+; RV32I-NEXT: sub a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: and a0, a0, a2
; RV32I-NEXT: ret
;
; RV32IXQCIA-LABEL: subusat:
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index fc73ce5503ffe..da2123a5dfe74 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -1068,12 +1068,12 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
; CHECK-LABEL: @extract_value_usub(
; CHECK-NEXT: [[Z:%.*]] = add nuw i8 [[ZZ:%.*]], 1
; CHECK-NEXT: [[Y:%.*]] = add i8 [[X:%.*]], [[Z]]
-; CHECK-NEXT: [[SUB:%.*]] = xor i8 [[ZZ]], -1
-; CHECK-NEXT: [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT: [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
+; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
+; CHECK-NEXT: [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
; CHECK-NEXT: call void @use.i1(i1 [[UOV]])
; CHECK-NEXT: call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[ZZ]], -1
-; CHECK-NEXT: ret i1 [[R]]
+; CHECK-NEXT: ret i1 false
;
%z = add nuw i8 %zz, 1
%y = add i8 %x, %z
@@ -1090,11 +1090,12 @@ define i1 @extract_value_usub(i8 %x, i8 %zz) {
define i1 @extract_value_usub_fail(i8 %x, i8 %z) {
; CHECK-LABEL: @extract_value_usub_fail(
; CHECK-NEXT: [[Y:%.*]] = add i8 [[X:%.*]], [[Z:%.*]]
-; CHECK-NEXT: [[SUB:%.*]] = sub i8 0, [[Z]]
-; CHECK-NEXT: [[UOV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT: [[SUB_UOV:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X]], i8 [[Y]])
+; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 0
+; CHECK-NEXT: [[UOV:%.*]] = extractvalue { i8, i1 } [[SUB_UOV]], 1
; CHECK-NEXT: call void @use.i1(i1 [[UOV]])
; CHECK-NEXT: call void @use.i8(i8 [[SUB]])
-; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[Z]], 0
+; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[SUB]], 0
; CHECK-NEXT: ret i1 [[R]]
;
%y = add i8 %x, %z
diff --git a/llvm/test/Transforms/InstCombine/pr170634.ll b/llvm/test/Transforms/InstCombine/pr170634.ll
index 62a332e14b04a..3224b8b63afd3 100644
--- a/llvm/test/Transforms/InstCombine/pr170634.ll
+++ b/llvm/test/Transforms/InstCombine/pr170634.ll
@@ -3,12 +3,13 @@
define dso_local i64 @func(i64 noundef %x, i64 noundef %y) local_unnamed_addr {
; CHECK-LABEL: @func(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
+; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
; CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
; CHECK: if.then:
; CHECK-NEXT: br label [[RETURN:%.*]]
; CHECK: if.end:
-; CHECK-NEXT: [[TMP1:%.*]] = sub nuw i64 [[X]], [[Y]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0
; CHECK-NEXT: br label [[RETURN]]
; CHECK: return:
; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i64 [ 291, [[IF_THEN]] ], [ [[TMP1]], [[IF_END]] ]
diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
index f8b318bc3680a..30a5072c7edc8 100644
--- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
@@ -141,16 +141,16 @@ define i1 @t1_strict_logical(i8 %base, i8 %offset) {
define i1 @t2(i8 %base, i8 %offset) {
; CHECK-LABEL: @t2(
-; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -168,16 +168,16 @@ define i1 @t2(i8 %base, i8 %offset) {
define i1 @t2_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t2_logical(
-; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -321,16 +321,16 @@ define i1 @t5_commutability2_logical(i8 %base, i8 %offset) {
define i1 @t6_commutability(i8 %base, i8 %offset) {
; CHECK-LABEL: @t6_commutability(
-; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -348,16 +348,16 @@ define i1 @t6_commutability(i8 %base, i8 %offset) {
define i1 @t6_commutability_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t6_commutability_logical(
-; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
; CHECK-NEXT: [[NO_UNDERFLOW:%.*]] = xor i1 [[UNDERFLOW]], true
; CHECK-NEXT: call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -459,14 +459,14 @@ define i1 @t7_nonstrict_logical(i8 %base, i8 %offset) {
define i1 @t8(i8 %base, i8 %offset) {
; CHECK-LABEL: @t8(
-; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
@@ -482,14 +482,14 @@ define i1 @t8(i8 %base, i8 %offset) {
define i1 @t8_logical(i8 %base, i8 %offset) {
; CHECK-LABEL: @t8_logical(
-; CHECK-NEXT: [[ADJUSTED:%.*]] = sub i8 [[BASE:%.*]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i8, i1 } poison, i8 [[ADJUSTED]], 0
-; CHECK-NEXT: [[AGG:%.*]] = insertvalue { i8, i1 } [[TMP3]], i1 [[UNDERFLOW]], 1
+; CHECK-NEXT: [[AGG:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[BASE:%.*]], i8 [[OFFSET:%.*]])
; CHECK-NEXT: call void @useagg({ i8, i1 } [[AGG]])
+; CHECK-NEXT: [[ADJUSTED:%.*]] = extractvalue { i8, i1 } [[AGG]], 0
; CHECK-NEXT: call void @use8(i8 [[ADJUSTED]])
+; CHECK-NEXT: [[UNDERFLOW:%.*]] = extractvalue { i8, i1 } [[AGG]], 1
; CHECK-NEXT: call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
+; CHECK-NEXT: [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT: [[R:%.*]] = or i1 [[NULL]], [[UNDERFLOW]]
; CHECK-NEXT: ret i1 [[R]]
;
%agg = call {i8, i1} @llvm.usub.with.overflow(i8 %base, i8 %offset)
diff --git a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
index c9030e5ab0321..90ca39a70a0bb 100644
--- a/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
+++ b/llvm/test/Transforms/InstCombine/usub-overflow-known-by-implied-cond.ll
@@ -175,10 +175,11 @@ define i32 @test7(i32 %a, i32 %b) {
; CHECK-NEXT: [[COND:%.*]] = icmp slt i32 [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: br i1 [[COND]], label [[BB1:%.*]], label [[BB3:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -204,10 +205,11 @@ define i32 @test8(i32 %a, i32 %b) {
; CHECK-NEXT: [[COND_NOT:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: br i1 [[COND_NOT]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -294,10 +296,11 @@ define i32 @test10(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[AND:%.*]] = and i1 [[COND]], [[COND2:%.*]]
; CHECK-NEXT: br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -325,10 +328,11 @@ define i32 @test10_logical(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[AND:%.*]] = select i1 [[COND]], i1 [[COND2:%.*]], i1 false
; CHECK-NEXT: br i1 [[AND]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -356,10 +360,11 @@ define i32 @test11(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -387,10 +392,11 @@ define i32 @test11_logical(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB1:%.*]], label [[BB3:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -418,10 +424,11 @@ define i32 @test12(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = or i1 [[COND]], [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
@@ -449,10 +456,11 @@ define i32 @test12_logical(i32 %a, i32 %b, i1 %cond2) {
; CHECK-NEXT: [[OR:%.*]] = select i1 [[COND]], i1 true, i1 [[COND2:%.*]]
; CHECK-NEXT: br i1 [[OR]], label [[BB3:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: [[SUB1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: [[C1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 1
; CHECK-NEXT: br i1 [[C1]], label [[BB3]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[R1:%.*]] = sub nuw i32 [[A]], [[B]]
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[SUB1]], 0
; CHECK-NEXT: ret i32 [[R1]]
; CHECK: bb3:
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/InstCombine/usubo.ll b/llvm/test/Transforms/InstCombine/usubo.ll
index e4b9c0e08ba22..2074190a2cd45 100644
--- a/llvm/test/Transforms/InstCombine/usubo.ll
+++ b/llvm/test/Transforms/InstCombine/usubo.ll
@@ -130,9 +130,10 @@ define i1 @sub_ne0(i8 %x, i8 %y, i1 %b) {
define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
; CHECK-LABEL: @sub_eq1(
-; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
; CHECK-NEXT: call void @use(i1 [[OV]])
+; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
; CHECK-NEXT: [[EQ1:%.*]] = icmp eq i8 [[SUB]], 1
; CHECK-NEXT: ret i1 [[EQ1]]
;
@@ -148,9 +149,10 @@ define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) {
define i1 @sub_sgt0(i8 %x, i8 %y, i1 %b) {
; CHECK-LABEL: @sub_sgt0(
-; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OV:%.*]] = icmp ult i8 [[X]], [[Y]]
+; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1
; CHECK-NEXT: call void @use(i1 [[OV]])
+; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0
; CHECK-NEXT: [[SGT0:%.*]] = icmp sgt i8 [[SUB]], 0
; CHECK-NEXT: ret i1 [[SGT0]]
;
diff --git a/llvm/test/Transforms/InstCombine/with_overflow.ll b/llvm/test/Transforms/InstCombine/with_overflow.ll
index 0c82bdc256ddf..09ef32262ea78 100644
--- a/llvm/test/Transforms/InstCombine/with_overflow.ll
+++ b/llvm/test/Transforms/InstCombine/with_overflow.ll
@@ -506,10 +506,7 @@ define { i32, i1 } @ssub_no_canonicalize_constant_arg0(i32 %x) nounwind {
define { i32, i1 } @usub_no_canonicalize_constant_arg0(i32 %x) nounwind {
; CHECK-LABEL: @usub_no_canonicalize_constant_arg0(
-; CHECK-NEXT: [[TMP1:%.*]] = sub i32 42, [[X:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 42, [[X]]
-; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i32, i1 } poison, i32 [[TMP1]], 0
-; CHECK-NEXT: [[A:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP2]], 1
+; CHECK-NEXT: [[A:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 [[X:%.*]])
; CHECK-NEXT: ret { i32, i1 } [[A]]
;
%a = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 42, i32 %x)
>From 42078e78872130250e5d2cb56d44cbf979277124 Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Sun, 7 Dec 2025 00:22:30 +0530
Subject: [PATCH 5/8] Update test cases
---
.../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 166 ++--
.../CodeGen/AMDGPU/a-v-global-atomicrmw.ll | 24 +-
llvm/test/CodeGen/AMDGPU/addsub64_carry.ll | 85 +-
.../test/CodeGen/AMDGPU/carryout-selection.ll | 889 ++++++++++++++++--
llvm/test/CodeGen/AMDGPU/usubo.ll | 196 ++--
llvm/test/CodeGen/AMDGPU/usubsat.ll | 54 +-
llvm/test/CodeGen/ARM/addsubo-legalization.ll | 22 +-
.../LoongArch/atomicrmw-cond-sub-clamp.ll | 24 +-
.../PowerPC/atomicrmw-cond-sub-clamp.ll | 23 +-
llvm/test/CodeGen/RISCV/pr170634.ll | 22 +
.../CodeGen/Thumb2/mve-saturating-arith.ll | 40 +-
.../VE/Scalar/atomicrmw-cond-sub-clamp.ll | 20 +-
.../WebAssembly/atomicrmw-cond-sub-clamp.ll | 40 +-
llvm/test/CodeGen/X86/combine-addo.ll | 13 +-
llvm/test/CodeGen/X86/combine-subo.ll | 4 +-
llvm/test/CodeGen/X86/vec_usubo.ll | 741 ++++++++-------
llvm/test/Transforms/InstCombine/pr170634.ll | 34 -
17 files changed, 1574 insertions(+), 823 deletions(-)
create mode 100644 llvm/test/CodeGen/RISCV/pr170634.ll
delete mode 100644 llvm/test/Transforms/InstCombine/pr170634.ll
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index b8962fa29e8f1..67c053ce2ba1a 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -8742,30 +8742,31 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB113_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB113_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
-; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
+; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
@@ -8776,26 +8777,27 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB113_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB113_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
-; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v6
+; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
-; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB113_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
@@ -8809,30 +8811,31 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
-; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB113_4
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: .LBB113_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
+; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
+; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
@@ -8843,26 +8846,26 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB113_4: ; %Flow3
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB113_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v6
+; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
+; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB113_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
@@ -8881,28 +8884,29 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:7]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB114_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB114_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
-; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
+; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
@@ -8911,22 +8915,23 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-NEXT: s_cbranch_execnz .LBB114_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB114_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB114_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v6
+; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc
+; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc
; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB114_6: ; %atomicrmw.phi
@@ -8960,9 +8965,10 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -8986,6 +8992,7 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
@@ -17059,8 +17066,9 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17079,19 +17087,20 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
-; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v4
+; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
-; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB221_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
@@ -17124,9 +17133,10 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17150,11 +17160,11 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB221_6: ; %atomicrmw.phi
; GFX950-NEXT: ;;#ASMSTART
@@ -17192,8 +17202,9 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17216,9 +17227,10 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc
; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB222_6: ; %atomicrmw.phi
@@ -17251,9 +17263,10 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17275,6 +17288,7 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
index b6fe0c756a106..53270d0c36ae1 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
@@ -5810,8 +5810,9 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -5844,9 +5845,10 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -5883,8 +5885,9 @@ define void @global_atomic_usub_sat_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -5913,9 +5916,10 @@ define void @global_atomic_usub_sat_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -11575,8 +11579,9 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -11610,9 +11615,10 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -11650,8 +11656,9 @@ define void @global_atomic_usub_sat_i64_saddr_ret_av_av(ptr addrspace(1) inreg %
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -11681,9 +11688,10 @@ define void @global_atomic_usub_sat_i64_saddr_ret_av_av(ptr addrspace(1) inreg %
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
index 8088c1b4c8fc7..46f1662b417bb 100644
--- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -57,15 +57,17 @@ define <2 x i64> @v_usub_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
; CHECK-LABEL: v_usub_v2i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_sub_co_u32_e32 v6, vcc, v2, v6
-; CHECK-NEXT: v_sub_co_u32_e64 v4, s[4:5], v0, v4
-; CHECK-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
-; CHECK-NEXT: v_subb_co_u32_e64 v5, s[4:5], v1, v5, s[4:5]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; CHECK-NEXT: v_sub_co_u32_e32 v12, vcc, v2, v6
+; CHECK-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v7, vcc
+; CHECK-NEXT: v_sub_co_u32_e32 v10, vcc, v0, v4
+; CHECK-NEXT: v_subb_co_u32_e32 v11, vcc, v1, v5, vcc
+; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[10:13]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[6:7]
; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CHECK-NEXT: v_mov_b32_e32 v3, v2
-; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
@@ -138,9 +140,10 @@ define i64 @v_usub_p1(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_usub_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v0
-; CHECK-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, 1, v[0:1]
+; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -157,11 +160,11 @@ define i64 @v_usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_usub_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_subrev_co_u32_e32 v0, vcc, -1, v0
-; CHECK-NEXT: v_subbrev_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 -1)
@@ -225,20 +228,24 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sub_u32 s6, s2, s6
-; CHECK-NEXT: s_subb_u32 s7, s3, s7
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_sub_u32 s0, s0, s4
-; CHECK-NEXT: s_subb_u32 s1, s1, s5
-; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3]
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; CHECK-NEXT: v_readfirstlane_b32 s0, v7
-; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s6
-; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: v_mov_b32_e32 v9, s7
+; CHECK-NEXT: v_mov_b32_e32 v8, s6
+; CHECK-NEXT: v_mov_b32_e32 v7, s5
+; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[8:9]
+; CHECK-NEXT: v_mov_b32_e32 v6, s4
+; CHECK-NEXT: s_sub_u32 s8, s2, s6
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
+; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[6:7]
+; CHECK-NEXT: s_subb_u32 s9, s3, s7
+; CHECK-NEXT: s_sub_u32 s10, s0, s4
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
+; CHECK-NEXT: s_subb_u32 s11, s1, s5
+; CHECK-NEXT: v_readfirstlane_b32 s0, v6
+; CHECK-NEXT: v_readfirstlane_b32 s2, v8
+; CHECK-NEXT: v_mov_b32_e32 v2, s10
+; CHECK-NEXT: v_mov_b32_e32 v3, s11
+; CHECK-NEXT: v_mov_b32_e32 v4, s8
+; CHECK-NEXT: v_mov_b32_e32 v5, s9
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -322,11 +329,11 @@ define amdgpu_ps i64 @s_uadd_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_p1:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sub_u32 s0, s0, 1
-; CHECK-NEXT: s_subb_u32 s1, s1, 0
-; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_add_u32 s2, s0, -1
+; CHECK-NEXT: s_addc_u32 s3, s1, -1
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 1
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -344,15 +351,13 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_n1:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sub_u32 s0, s0, -1
-; CHECK-NEXT: s_subb_u32 s1, s1, -1
-; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_add_u32 s0, s0, 1
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: ; return to shader part epilog
%pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 -1)
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 163d7ff9c61fc..19b801a840ae0 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -128,6 +128,31 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: sadd64rr
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[S_ADD_U]]
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[COPY9]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%add = add i64 %a, %b
store i64 %add, ptr addrspace(1) %out
@@ -238,6 +263,30 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: sadd64ri
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
+; GCN-ISEL-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
+; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[S_ADD_U]]
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%add = add i64 20015998343286, %a
store i64 %add, ptr addrspace(1) %out
@@ -340,6 +389,29 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: vadd64rr
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+; GCN-ISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+; GCN-ISEL-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $vcc, implicit $exec
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[V_ADD_U]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -442,6 +514,26 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: vadd64ri
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
+; GCN-ISEL-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
+; GCN-ISEL-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE2]], implicit-def dead $vcc, implicit $exec
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[V_ADD_U]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -550,6 +642,24 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: suaddo32
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
+; GCN-ISEL-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY3]], killed [[COPY4]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORD_OFFSET killed [[COPY5]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
@@ -686,6 +796,35 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: uaddo32_vcc_user
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
+; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]]
+; GCN-ISEL-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 killed [[COPY9]], [[COPY11]], 0, implicit $exec
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORD_OFFSET killed [[V_ADD_CO_U32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
+; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_ADD_CO_U32_e64_1]], implicit $exec
+; GCN-ISEL-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
@@ -832,6 +971,38 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9]
; GFX1250-NEXT: global_store_b8 v2, v3, s[10:11]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: suaddo64
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s256) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
+; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
+; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
+; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
+; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
+; GCN-ISEL-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
+; GCN-ISEL-NEXT: [[S_UADDO:%[0-9]+]]:sreg_32, [[S_UADDO1:%[0-9]+]]:sreg_64_xexec = S_UADDO_PSEUDO killed [[COPY10]], killed [[COPY12]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[S_ADD_C:%[0-9]+]]:sreg_32, [[S_ADD_C1:%[0-9]+]]:sreg_64_xexec = S_ADD_CO_PSEUDO killed [[COPY9]], killed [[COPY11]], killed [[S_UADDO1]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_UADDO]], %subreg.sub0, killed [[S_ADD_C]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE4]]
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[COPY13]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_ADD_C1]], implicit $exec
+; GCN-ISEL-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
@@ -978,6 +1149,40 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: vuaddo64
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
+; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 killed [[COPY3]], [[COPY1]](s32), 0, implicit $exec
+; GCN-ISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+; GCN-ISEL-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]]
+; GCN-ISEL-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_2]]
+; GCN-ISEL-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY12]], [[COPY13]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+; GCN-ISEL-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_ADDC_U32_e64_1]], implicit $exec
+; GCN-ISEL-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -1105,6 +1310,31 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: ssub64rr
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[S_SUB_U]]
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[COPY9]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%sub = sub i64 %a, %b
store i64 %sub, ptr addrspace(1) %out
@@ -1215,6 +1445,30 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: ssub64ri
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
+; GCN-ISEL-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
+; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[S_SUB_U]]
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%sub = sub i64 20015998343286, %a
store i64 %sub, ptr addrspace(1) %out
@@ -1317,6 +1571,29 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[2:3], v[0:1]
; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: vsub64rr
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+; GCN-ISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+; GCN-ISEL-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $vcc, implicit $exec
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[V_SUB_U]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1419,6 +1696,26 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: vsub64ri
+; GCN-ISEL: bb.0.entry:
+; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
+; GCN-ISEL-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
+; GCN-ISEL-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], implicit-def dead $vcc, implicit $exec
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[V_SUB_U]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1528,6 +1825,24 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: susubo32
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
+; GCN-ISEL-NEXT: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 killed [[COPY3]], killed [[COPY4]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_SUB_I32_]]
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORD_OFFSET killed [[COPY5]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
@@ -1664,6 +1979,35 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: usubo32_vcc_user
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
+; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]]
+; GCN-ISEL-NEXT: [[V_SUB_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_SUB_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_SUB_CO_U32_e64 killed [[COPY9]], [[COPY11]], 0, implicit $exec
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORD_OFFSET killed [[V_SUB_CO_U32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
+; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_SUB_CO_U32_e64_1]], implicit $exec
+; GCN-ISEL-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
@@ -1685,20 +2029,21 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s11, 0xf000
; CISI-NEXT: s_mov_b32 s10, -1
; CISI-NEXT: s_waitcnt lgkmcnt(0)
-; CISI-NEXT: s_sub_u32 s4, s4, s6
-; CISI-NEXT: s_subb_u32 s5, s5, s7
+; CISI-NEXT: v_mov_b32_e32 v0, s6
+; CISI-NEXT: v_mov_b32_e32 v1, s7
+; CISI-NEXT: s_sub_u32 s6, s4, s6
+; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; CISI-NEXT: s_subb_u32 s7, s5, s7
+; CISI-NEXT: v_mov_b32_e32 v2, s6
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
-; CISI-NEXT: v_mov_b32_e32 v0, s4
-; CISI-NEXT: v_mov_b32_e32 v1, s5
-; CISI-NEXT: s_cselect_b64 s[4:5], -1, 0
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s10
; CISI-NEXT: s_mov_b32 s3, s11
-; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
-; CISI-NEXT: s_waitcnt expcnt(0)
-; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; CISI-NEXT: v_mov_b32_e32 v3, s7
+; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CISI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; CISI-NEXT: s_endpgm
;
@@ -1707,16 +2052,18 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v7, s1
+; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
@@ -1725,12 +2072,14 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s14
+; GFX9-NEXT: v_mov_b32_e32 v1, s15
; GFX9-NEXT: s_sub_u32 s0, s12, s14
; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX9-NEXT: global_store_byte v2, v3, s[10:11]
; GFX9-NEXT: s_endpgm
@@ -1743,8 +2092,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: s_sub_u32 s0, s12, s14
; GFX1010-NEXT: s_subb_u32 s1, s13, s15
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
-; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: v_cmp_gt_u64_e64 s0, s[12:13], s[14:15]
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX1010-NEXT: global_store_byte v2, v3, s[10:11]
@@ -1755,11 +2104,11 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
+; GFX1030W32-NEXT: s_sub_u32 s8, s4, s6
+; GFX1030W32-NEXT: v_cmp_gt_u64_e64 s4, s[4:5], s[6:7]
+; GFX1030W32-NEXT: s_subb_u32 s9, s5, s7
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s8
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s9
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
@@ -1770,11 +2119,11 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX1030W64-NEXT: s_sub_u32 s8, s4, s6
+; GFX1030W64-NEXT: s_subb_u32 s9, s5, s7
+; GFX1030W64-NEXT: v_cmp_gt_u64_e64 s[4:5], s[4:5], s[6:7]
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s8
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s9
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
@@ -1784,11 +2133,11 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_subb_u32 s5, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: s_sub_u32 s8, s4, s6
+; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[4:5], s[6:7]
+; GFX11-NEXT: s_subb_u32 s9, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s8
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -1799,17 +2148,51 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14
-; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15
-; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
-; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_sub_nc_u64 s[0:1], s[12:13], s[14:15]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-NEXT: v_cmp_gt_u64_e64 s0, s[12:13], s[14:15]
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9]
; GFX1250-NEXT: global_store_b8 v2, v3, s[10:11]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: susubo64
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s256) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
+; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
+; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
+; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
+; GCN-ISEL-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
+; GCN-ISEL-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
+; GCN-ISEL-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE5]]
+; GCN-ISEL-NEXT: [[V_CMP_GT_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U64_e64 [[REG_SEQUENCE4]], [[COPY13]], implicit $exec
+; GCN-ISEL-NEXT: [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO [[REG_SEQUENCE4]], [[REG_SEQUENCE5]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[S_SUB_U]]
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[COPY14]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_CMP_GT_U64_e64_]], implicit $exec
+; GCN-ISEL-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
@@ -1829,21 +2212,22 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI: ; %bb.0:
; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; CISI-NEXT: v_mov_b32_e32 v1, 0
; CISI-NEXT: s_mov_b32 s7, 0xf000
; CISI-NEXT: s_mov_b32 s6, -1
; CISI-NEXT: s_waitcnt lgkmcnt(0)
; CISI-NEXT: s_mov_b32 s4, s0
-; CISI-NEXT: v_mov_b32_e32 v1, s9
-; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
+; CISI-NEXT: v_mov_b32_e32 v3, s9
+; CISI-NEXT: v_sub_i32_e32 v2, vcc, s8, v0
+; CISI-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
; CISI-NEXT: s_mov_b32 s5, s1
-; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s6
; CISI-NEXT: s_mov_b32 s3, s7
-; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; CISI-NEXT: s_waitcnt expcnt(0)
; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CISI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; CISI-NEXT: s_endpgm
;
@@ -1851,31 +2235,34 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v6, s5
-; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v0
-; VI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v7, s5
+; VI-NEXT: v_sub_u32_e32 v6, vcc, s4, v0
+; VI-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
-; VI-NEXT: flat_store_byte v[3:4], v0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[6:7]
+; VI-NEXT: flat_store_byte v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vusubo64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
-; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v0
+; GFX9-NEXT: v_subbrev_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
+; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vusubo64:
@@ -1883,13 +2270,14 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: s_clause 0x1
; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1010-NEXT: v_mov_b32_e32 v2, 0
+; GFX1010-NEXT: v_mov_b32_e32 v1, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: v_sub_co_u32 v0, s4, s6, v0
-; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
-; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
+; GFX1010-NEXT: v_sub_co_u32 v2, s4, s6, v0
+; GFX1010-NEXT: v_sub_co_ci_u32_e64 v3, s4, s7, 0, s4
+; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1010-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX1010-NEXT: global_store_byte v1, v0, s[2:3]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vusubo64:
@@ -1897,13 +2285,14 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_sub_co_u32 v0, s4, s6, v0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
+; GFX1030W32-NEXT: v_sub_co_u32 v2, s4, s6, v0
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v3, null, s7, 0, s4
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1030W32-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX1030W32-NEXT: global_store_byte v1, v0, s[2:3]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vusubo64:
@@ -1911,13 +2300,14 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s6, v0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
+; GFX1030W64-NEXT: v_sub_co_u32 v2, s[4:5], s6, v0
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v3, null, s7, 0, s[4:5]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX1030W64-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX1030W64-NEXT: global_store_byte v1, v0, s[2:3]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vusubo64:
@@ -1925,17 +2315,16 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_co_u32 v0, s4, s6, v0
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
+; GFX11-NEXT: v_sub_co_u32 v2, s4, s6, v0
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, s7, 0, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: vusubo64:
@@ -1944,18 +2333,50 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_sub_co_u32 v0, s4, s6, v0
-; GFX1250-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1250-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[6:7], v[0:1]
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
+; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX1250-NEXT: s_endpgm
+; GCN-ISEL-LABEL: name: vusubo64
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
+; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+; GCN-ISEL-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+; GCN-ISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+; GCN-ISEL-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+; GCN-ISEL-NEXT: [[V_CMP_GT_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE5]], implicit $exec
+; GCN-ISEL-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO [[REG_SEQUENCE4]], [[REG_SEQUENCE5]], implicit-def dead $vcc, implicit $exec
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[V_SUB_U]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_CMP_GT_U64_e64_]], implicit $exec
+; GCN-ISEL-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -3205,6 +3626,292 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: .LBB16_4:
; GFX1250-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX1250-NEXT: s_branch .LBB16_2
+; GCN-ISEL-LABEL: name: sudiv64
+; GCN-ISEL: bb.0 (%ir-block.0):
+; GCN-ISEL-NEXT: successors: %bb.3(0x50000000), %bb.1(0x30000000)
+; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_192 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3, [[COPY2]], %subreg.sub4, [[COPY1]], %subreg.sub5
+; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sgpr_192 = COPY [[REG_SEQUENCE]]
+; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
+; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]]
+; GCN-ISEL-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
+; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_]], %subreg.sub0, killed [[COPY10]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
+; GCN-ISEL-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 killed [[REG_SEQUENCE3]], [[COPY11]], implicit $exec
+; GCN-ISEL-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+; GCN-ISEL-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+; GCN-ISEL-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, killed [[V_CMP_NE_U64_e64_]], implicit-def dead $scc
+; GCN-ISEL-NEXT: $vcc = COPY [[S_AND_B64_]]
+; GCN-ISEL-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+; GCN-ISEL-NEXT: S_BRANCH %bb.1
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: bb.1.Flow:
+; GCN-ISEL-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF]], %bb.0, %6, %bb.3
+; GCN-ISEL-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_1]], %bb.0, %40, %bb.3
+; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI1]], implicit $exec
+; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+; GCN-ISEL-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_]]
+; GCN-ISEL-NEXT: S_CMP_LG_U32 killed [[COPY12]], killed [[S_MOV_B32_1]], implicit-def $scc
+; GCN-ISEL-NEXT: S_CBRANCH_SCC1 %bb.4, implicit $scc
+; GCN-ISEL-NEXT: S_BRANCH %bb.2
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: bb.2 (%ir-block.7):
+; GCN-ISEL-NEXT: successors: %bb.4(0x80000000)
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY9]].sub0
+; GCN-ISEL-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+; GCN-ISEL-NEXT: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 killed [[S_MOV_B32_2]], [[COPY13]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY13]], implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[V_RCP_IFLAG_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e32 killed [[V_CVT_F32_U32_e32_]], implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1333788670, killed [[V_RCP_IFLAG_F32_e32_]], implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 killed [[V_MUL_F32_e32_]], implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e32_]]
+; GCN-ISEL-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_SUB_I32_]], [[COPY15]]
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e32_]], killed [[S_MUL_I32_]], implicit $exec
+; GCN-ISEL-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e32_]]
+; GCN-ISEL-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_]]
+; GCN-ISEL-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY16]], killed [[COPY17]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY killed [[S_ADD_I32_]]
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY14]], [[COPY18]], implicit $exec
+; GCN-ISEL-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+; GCN-ISEL-NEXT: [[COPY19:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_1]]
+; GCN-ISEL-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY19]], [[S_MOV_B32_3]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY20:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_1]]
+; GCN-ISEL-NEXT: [[S_MUL_I32_1:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY20]], [[COPY13]]
+; GCN-ISEL-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY14]], killed [[S_MUL_I32_1]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_SUB_I32_1]], [[COPY13]], implicit-def dead $scc
+; GCN-ISEL-NEXT: S_CMP_GE_U32 [[S_SUB_I32_1]], [[COPY13]], implicit-def $scc
+; GCN-ISEL-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_SUB_I32_2]], [[S_SUB_I32_1]], implicit $scc
+; GCN-ISEL-NEXT: [[COPY21:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_1]]
+; GCN-ISEL-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_ADD_I32_1]], [[COPY21]], implicit $scc
+; GCN-ISEL-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_CSELECT_B32_1]], [[S_MOV_B32_3]], implicit-def dead $scc
+; GCN-ISEL-NEXT: S_CMP_GE_U32 killed [[S_CSELECT_B32_]], [[COPY13]], implicit-def $scc
+; GCN-ISEL-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_ADD_I32_2]], [[S_CSELECT_B32_1]], implicit $scc
+; GCN-ISEL-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_CSELECT_B32_2]], %subreg.sub0, killed [[S_MOV_B32_4]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY22:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE4]]
+; GCN-ISEL-NEXT: S_BRANCH %bb.4
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: bb.3 (%ir-block.12):
+; GCN-ISEL-NEXT: successors: %bb.1(0x80000000)
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY9]].sub0
+; GCN-ISEL-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY23]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY9]].sub1
+; GCN-ISEL-NEXT: [[V_CVT_F32_U32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY24]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 1333788672
+; GCN-ISEL-NEXT: [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed [[V_CVT_F32_U32_e64_1]], 0, killed [[S_MOV_B32_5]], 0, killed [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, killed [[V_FMA_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 1602224124
+; GCN-ISEL-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_RCP_F32_e64_]], 0, killed [[S_MOV_B32_6]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 796917760
+; GCN-ISEL-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_MUL_F32_e64_]], 0, killed [[S_MOV_B32_7]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[V_TRUNC_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_TRUNC_F32_e64 0, killed [[V_MUL_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 -813694976
+; GCN-ISEL-NEXT: [[V_FMA_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[V_TRUNC_F32_e64_]], 0, killed [[S_MOV_B32_8]], 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[V_CVT_U32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed [[V_FMA_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+; GCN-ISEL-NEXT: [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO killed [[S_MOV_B64_2]], [[COPY9]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY25:%[0-9]+]]:sreg_32 = COPY [[S_SUB_U]].sub1
+; GCN-ISEL-NEXT: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
+; GCN-ISEL-NEXT: [[S_MUL_I32_2:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY25]], [[COPY26]]
+; GCN-ISEL-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[S_SUB_U]].sub0
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY27]], [[V_CVT_U32_F32_e64_]], implicit $exec
+; GCN-ISEL-NEXT: [[V_CVT_U32_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_TRUNC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+; GCN-ISEL-NEXT: [[COPY28:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
+; GCN-ISEL-NEXT: [[S_MUL_I32_3:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[COPY28]]
+; GCN-ISEL-NEXT: [[COPY29:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_2]]
+; GCN-ISEL-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY29]], killed [[S_MUL_I32_3]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_3]], killed [[S_MUL_I32_2]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_]], [[S_ADD_I32_4]], implicit $exec
+; GCN-ISEL-NEXT: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
+; GCN-ISEL-NEXT: [[S_MUL_I32_4:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY30]], [[S_ADD_I32_4]]
+; GCN-ISEL-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_4]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_3]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
+; GCN-ISEL-NEXT: [[S_MUL_I32_5:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[COPY31]]
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_]], [[S_MUL_I32_5]], implicit $exec
+; GCN-ISEL-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_MUL_HI_U32_e64_4]], %subreg.sub0, [[S_MOV_B32_9]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE6]], killed [[REG_SEQUENCE5]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY32:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U]].sub0
+; GCN-ISEL-NEXT: [[COPY33:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U]].sub1
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_1]], [[S_ADD_I32_4]], implicit $exec
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_6:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_1]], [[S_MUL_I32_5]], implicit $exec
+; GCN-ISEL-NEXT: [[COPY34:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
+; GCN-ISEL-NEXT: [[S_MUL_I32_6:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY34]], [[S_MUL_I32_5]]
+; GCN-ISEL-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_6]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_6]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY35:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE7]].sub0
+; GCN-ISEL-NEXT: [[COPY36:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE7]].sub1
+; GCN-ISEL-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+; GCN-ISEL-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 killed [[COPY32]], killed [[COPY35]], implicit-def $scc
+; GCN-ISEL-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY33]], killed [[COPY36]], implicit-def $scc, implicit $scc
+; GCN-ISEL-NEXT: [[COPY37:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_5]]
+; GCN-ISEL-NEXT: [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY37]], [[S_MOV_B32_10]], implicit-def dead $scc, implicit $scc
+; GCN-ISEL-NEXT: [[COPY38:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
+; GCN-ISEL-NEXT: [[S_MUL_I32_7:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY38]], [[S_ADD_I32_4]]
+; GCN-ISEL-NEXT: [[REG_SEQUENCE8:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_7]], %subreg.sub0, killed [[S_ADDC_U32_1]], %subreg.sub1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE9:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_]], %subreg.sub0, killed [[S_ADDC_U32_]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY39:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE9]].sub1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE10:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY39]], %subreg.sub0, [[S_MOV_B32_10]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_ADD_U1:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE10]], killed [[REG_SEQUENCE8]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY40:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U1]].sub0
+; GCN-ISEL-NEXT: [[COPY41:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
+; GCN-ISEL-NEXT: [[S_UADDO:%[0-9]+]]:sreg_32, [[S_UADDO1:%[0-9]+]]:sreg_64_xexec = S_UADDO_PSEUDO [[COPY41]], killed [[COPY40]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY42:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U1]].sub1
+; GCN-ISEL-NEXT: [[COPY43:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
+; GCN-ISEL-NEXT: [[S_ADD_C:%[0-9]+]]:sreg_32, [[S_ADD_C1:%[0-9]+]]:sreg_64_xexec = S_ADD_CO_PSEUDO [[COPY43]], killed [[COPY42]], killed [[S_UADDO1]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[S_MUL_I32_8:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[S_ADD_C]]
+; GCN-ISEL-NEXT: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_UADDO]]
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_7:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY27]], [[COPY44]], implicit $exec
+; GCN-ISEL-NEXT: [[COPY45:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_7]]
+; GCN-ISEL-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY45]], killed [[S_MUL_I32_8]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[S_MUL_I32_9:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY25]], [[S_UADDO]]
+; GCN-ISEL-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_5]], killed [[S_MUL_I32_9]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_6]]
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_8:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_ADD_C]], [[COPY46]], implicit $exec
+; GCN-ISEL-NEXT: [[S_MUL_I32_10:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[S_UADDO]]
+; GCN-ISEL-NEXT: [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_10]]
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_9:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_ADD_C]], [[COPY47]], implicit $exec
+; GCN-ISEL-NEXT: [[S_MUL_I32_11:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_ADD_C]], [[S_MUL_I32_10]]
+; GCN-ISEL-NEXT: [[REG_SEQUENCE11:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_11]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_9]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY48:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE11]].sub0
+; GCN-ISEL-NEXT: [[COPY49:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE11]].sub1
+; GCN-ISEL-NEXT: [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_6]]
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_10:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_UADDO]], [[COPY50]], implicit $exec
+; GCN-ISEL-NEXT: [[S_MUL_I32_12:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_UADDO]], [[S_ADD_I32_6]]
+; GCN-ISEL-NEXT: [[REG_SEQUENCE12:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_12]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_10]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_10]]
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_11:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_UADDO]], [[COPY51]], implicit $exec
+; GCN-ISEL-NEXT: [[REG_SEQUENCE13:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_MUL_HI_U32_e64_11]], %subreg.sub0, [[S_MOV_B32_9]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_ADD_U2:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE13]], killed [[REG_SEQUENCE12]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY52:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U2]].sub0
+; GCN-ISEL-NEXT: [[COPY53:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U2]].sub1
+; GCN-ISEL-NEXT: [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 killed [[COPY52]], killed [[COPY48]], implicit-def $scc
+; GCN-ISEL-NEXT: [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY53]], killed [[COPY49]], implicit-def $scc, implicit $scc
+; GCN-ISEL-NEXT: [[COPY54:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_8]]
+; GCN-ISEL-NEXT: [[S_ADDC_U32_3:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY54]], [[S_MOV_B32_10]], implicit-def dead $scc, implicit $scc
+; GCN-ISEL-NEXT: [[S_MUL_I32_13:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_ADD_C]], [[S_ADD_I32_6]]
+; GCN-ISEL-NEXT: [[REG_SEQUENCE14:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_13]], %subreg.sub0, killed [[S_ADDC_U32_3]], %subreg.sub1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE15:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_1]], %subreg.sub0, killed [[S_ADDC_U32_2]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY55:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE15]].sub1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE16:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY55]], %subreg.sub0, [[S_MOV_B32_10]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_ADD_U3:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE16]], killed [[REG_SEQUENCE14]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY56:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U3]].sub0
+; GCN-ISEL-NEXT: [[S_UADDO2:%[0-9]+]]:sreg_32, [[S_UADDO3:%[0-9]+]]:sreg_64_xexec = S_UADDO_PSEUDO [[S_UADDO]], killed [[COPY56]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY57:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U3]].sub1
+; GCN-ISEL-NEXT: [[S_ADD_C2:%[0-9]+]]:sreg_32, [[S_ADD_C3:%[0-9]+]]:sreg_64_xexec = S_ADD_CO_PSEUDO [[S_ADD_C]], killed [[COPY57]], killed [[S_UADDO3]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY58:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0
+; GCN-ISEL-NEXT: [[COPY59:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_C2]]
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_12:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY58]], [[COPY59]], implicit $exec
+; GCN-ISEL-NEXT: [[S_MUL_I32_14:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY58]], [[S_ADD_C2]]
+; GCN-ISEL-NEXT: [[REG_SEQUENCE17:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_14]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_12]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY60:%[0-9]+]]:vgpr_32 = COPY [[S_UADDO2]]
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_13:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY58]], [[COPY60]], implicit $exec
+; GCN-ISEL-NEXT: [[REG_SEQUENCE18:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_MUL_HI_U32_e64_13]], %subreg.sub0, [[S_MOV_B32_9]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_ADD_U4:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE18]], killed [[REG_SEQUENCE17]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY61:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U4]].sub0
+; GCN-ISEL-NEXT: [[COPY62:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U4]].sub1
+; GCN-ISEL-NEXT: [[COPY63:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1
+; GCN-ISEL-NEXT: [[COPY64:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_C2]]
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_14:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY63]], [[COPY64]], implicit $exec
+; GCN-ISEL-NEXT: [[COPY65:%[0-9]+]]:vgpr_32 = COPY [[S_UADDO2]]
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_15:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY63]], [[COPY65]], implicit $exec
+; GCN-ISEL-NEXT: [[S_MUL_I32_15:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY63]], [[S_UADDO2]]
+; GCN-ISEL-NEXT: [[REG_SEQUENCE19:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_15]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_15]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY66:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE19]].sub0
+; GCN-ISEL-NEXT: [[COPY67:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE19]].sub1
+; GCN-ISEL-NEXT: [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 killed [[COPY61]], killed [[COPY66]], implicit-def $scc
+; GCN-ISEL-NEXT: [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY62]], killed [[COPY67]], implicit-def $scc, implicit $scc
+; GCN-ISEL-NEXT: [[COPY68:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_14]]
+; GCN-ISEL-NEXT: [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY68]], [[S_MOV_B32_10]], implicit-def dead $scc, implicit $scc
+; GCN-ISEL-NEXT: [[S_MUL_I32_16:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY63]], [[S_ADD_C2]]
+; GCN-ISEL-NEXT: [[REG_SEQUENCE20:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_16]], %subreg.sub0, killed [[S_ADDC_U32_5]], %subreg.sub1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE21:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_2]], %subreg.sub0, killed [[S_ADDC_U32_4]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY69:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE21]].sub1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE22:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY69]], %subreg.sub0, [[S_MOV_B32_10]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_ADD_U5:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE22]], killed [[REG_SEQUENCE20]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY70:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U5]].sub1
+; GCN-ISEL-NEXT: [[S_MUL_I32_17:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY23]], [[COPY70]]
+; GCN-ISEL-NEXT: [[COPY71:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U5]].sub0
+; GCN-ISEL-NEXT: [[COPY72:%[0-9]+]]:vgpr_32 = COPY [[COPY71]]
+; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_16:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY23]], [[COPY72]], implicit $exec
+; GCN-ISEL-NEXT: [[COPY73:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_16]]
+; GCN-ISEL-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY73]], killed [[S_MUL_I32_17]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[S_MUL_I32_18:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY24]], [[COPY71]]
+; GCN-ISEL-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_7]], killed [[S_MUL_I32_18]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[S_SUB_I32_3:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY63]], [[S_ADD_I32_8]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[S_MUL_I32_19:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY23]], [[COPY71]]
+; GCN-ISEL-NEXT: [[S_USUBO:%[0-9]+]]:sreg_32, [[S_USUBO1:%[0-9]+]]:sreg_64_xexec = S_USUBO_PSEUDO [[COPY58]], killed [[S_MUL_I32_19]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[S_SUB_C:%[0-9]+]]:sreg_32, [[S_SUB_C1:%[0-9]+]]:sreg_64_xexec = S_SUB_CO_PSEUDO killed [[S_SUB_I32_3]], [[COPY24]], [[S_USUBO1]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[S_USUBO2:%[0-9]+]]:sreg_32, [[S_USUBO3:%[0-9]+]]:sreg_64_xexec = S_USUBO_PSEUDO [[S_USUBO]], [[COPY23]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[S_SUB_C2:%[0-9]+]]:sreg_32, [[S_SUB_C3:%[0-9]+]]:sreg_64_xexec = S_SUB_CO_PSEUDO killed [[S_SUB_C]], [[S_MOV_B32_10]], killed [[S_USUBO3]], implicit-def dead $scc
+; GCN-ISEL-NEXT: S_CMP_GE_U32 [[S_SUB_C2]], [[COPY24]], implicit-def $scc
+; GCN-ISEL-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[S_CSELECT_B32_3:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
+; GCN-ISEL-NEXT: S_CMP_GE_U32 killed [[S_USUBO2]], [[COPY23]], implicit-def $scc
+; GCN-ISEL-NEXT: [[S_CSELECT_B32_4:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
+; GCN-ISEL-NEXT: S_CMP_EQ_U32 [[S_SUB_C2]], [[COPY24]], implicit-def $scc
+; GCN-ISEL-NEXT: [[S_CSELECT_B32_5:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_4]], killed [[S_CSELECT_B32_3]], implicit $scc
+; GCN-ISEL-NEXT: [[COPY74:%[0-9]+]]:sreg_32 = COPY killed [[S_CSELECT_B32_5]]
+; GCN-ISEL-NEXT: [[REG_SEQUENCE23:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY71]], %subreg.sub0, [[COPY70]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+; GCN-ISEL-NEXT: [[S_ADD_U6:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO [[REG_SEQUENCE23]], killed [[S_MOV_B64_3]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY75:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U6]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B64_4:%[0-9]+]]:sreg_64 = S_MOV_B64 2
+; GCN-ISEL-NEXT: [[S_ADD_U7:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO [[REG_SEQUENCE23]], killed [[S_MOV_B64_4]], implicit-def dead $scc
+; GCN-ISEL-NEXT: [[COPY76:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U7]].sub0
+; GCN-ISEL-NEXT: S_CMP_LG_U32 killed [[COPY74]], [[S_MOV_B32_10]], implicit-def $scc
+; GCN-ISEL-NEXT: [[S_CSELECT_B32_6:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[COPY76]], killed [[COPY75]], implicit $scc
+; GCN-ISEL-NEXT: [[COPY77:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U6]].sub1
+; GCN-ISEL-NEXT: [[COPY78:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U7]].sub1
+; GCN-ISEL-NEXT: [[S_CSELECT_B32_7:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[COPY78]], killed [[COPY77]], implicit $scc
+; GCN-ISEL-NEXT: [[S_SUB_C4:%[0-9]+]]:sreg_32, [[S_SUB_C5:%[0-9]+]]:sreg_64_xexec = S_SUB_CO_PSEUDO [[COPY63]], [[S_ADD_I32_8]], [[S_USUBO1]], implicit-def dead $scc
+; GCN-ISEL-NEXT: S_CMP_GE_U32 [[S_SUB_C4]], [[COPY24]], implicit-def $scc
+; GCN-ISEL-NEXT: [[S_CSELECT_B32_8:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
+; GCN-ISEL-NEXT: S_CMP_GE_U32 [[S_USUBO]], [[COPY23]], implicit-def $scc
+; GCN-ISEL-NEXT: [[S_CSELECT_B32_9:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
+; GCN-ISEL-NEXT: S_CMP_EQ_U32 [[S_SUB_C4]], [[COPY24]], implicit-def $scc
+; GCN-ISEL-NEXT: [[S_CSELECT_B32_10:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_9]], killed [[S_CSELECT_B32_8]], implicit $scc
+; GCN-ISEL-NEXT: [[COPY79:%[0-9]+]]:sreg_32 = COPY killed [[S_CSELECT_B32_10]]
+; GCN-ISEL-NEXT: S_CMP_LG_U32 killed [[COPY79]], [[S_MOV_B32_10]], implicit-def $scc
+; GCN-ISEL-NEXT: [[S_CSELECT_B32_11:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_7]], [[COPY70]], implicit $scc
+; GCN-ISEL-NEXT: [[S_CSELECT_B32_12:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_6]], [[COPY71]], implicit $scc
+; GCN-ISEL-NEXT: [[REG_SEQUENCE24:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_CSELECT_B32_12]], %subreg.sub0, killed [[S_CSELECT_B32_11]], %subreg.sub1
+; GCN-ISEL-NEXT: [[S_MOV_B64_5:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+; GCN-ISEL-NEXT: [[COPY80:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE24]]
+; GCN-ISEL-NEXT: S_BRANCH %bb.1
+; GCN-ISEL-NEXT: {{ $}}
+; GCN-ISEL-NEXT: bb.4 (%ir-block.14):
+; GCN-ISEL-NEXT: [[PHI2:%[0-9]+]]:sreg_64 = PHI [[PHI]], %bb.1, [[COPY22]], %bb.2
+; GCN-ISEL-NEXT: [[COPY81:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1
+; GCN-ISEL-NEXT: [[COPY82:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0
+; GCN-ISEL-NEXT: [[REG_SEQUENCE25:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY82]], %subreg.sub0, killed [[COPY81]], %subreg.sub1
+; GCN-ISEL-NEXT: [[COPY83:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE25]].sub1
+; GCN-ISEL-NEXT: [[COPY84:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE25]].sub0
+; GCN-ISEL-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+; GCN-ISEL-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+; GCN-ISEL-NEXT: [[REG_SEQUENCE26:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY84]], %subreg.sub0, killed [[COPY83]], %subreg.sub1, killed [[S_MOV_B32_13]], %subreg.sub2, killed [[S_MOV_B32_12]], %subreg.sub3
+; GCN-ISEL-NEXT: [[COPY85:%[0-9]+]]:vreg_64 = COPY [[PHI2]]
+; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET [[COPY85]], killed [[REG_SEQUENCE26]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.16, addrspace 1)
+; GCN-ISEL-NEXT: S_ENDPGM 0
%result = udiv i64 %x, %y
store i64 %result, ptr addrspace(1) %out
ret void
@@ -3225,5 +3932,3 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN-ISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 8a54ad301f48a..7f3c0c0c8605e 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -14,13 +14,15 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_sub_u32 s2, s2, s8
-; SI-NEXT: s_subb_u32 s3, s3, s9
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
+; SI-NEXT: s_sub_u32 s0, s2, s8
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
+; SI-NEXT: s_subb_u32 s1, s3, s9
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -31,13 +33,15 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s2, s2, s4
-; VI-NEXT: s_subb_u32 s3, s3, s5
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: s_sub_u32 s0, s2, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; VI-NEXT: s_subb_u32 s1, s3, s5
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -48,10 +52,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT: s_sub_u32 s4, s2, s6
; GFX9-NEXT: s_subb_u32 s5, s3, s7
-; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -65,9 +71,9 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_cmp_gt_u64_e64 s4, s[2:3], s[6:7]
; GFX10-NEXT: s_sub_u32 s2, s2, s6
; GFX10-NEXT: s_subb_u32 s3, s3, s7
-; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v0, s2, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
@@ -81,13 +87,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cmp_gt_u64_e64 s6, s[2:3], s[4:5]
; GFX11-NEXT: s_sub_u32 s2, s2, s4
; GFX11-NEXT: s_subb_u32 s3, s3, s5
-; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -429,20 +434,21 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_sub_u32 s4, s4, s6
-; SI-NEXT: s_subb_u32 s5, s5, s7
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: s_sub_u32 s6, s4, s6
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; SI-NEXT: s_subb_u32 s7, s5, s7
+; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_mov_b32_e32 v1, s5
-; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: s_mov_b32 s0, s2
; SI-NEXT: s_mov_b32 s1, s3
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: v_mov_b32_e32 v3, s7
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -451,16 +457,18 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v7, s1
+; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
@@ -469,12 +477,14 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s14
+; GFX9-NEXT: v_mov_b32_e32 v1, s15
; GFX9-NEXT: s_sub_u32 s0, s12, s14
; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX9-NEXT: global_store_byte v2, v3, s[10:11]
; GFX9-NEXT: s_endpgm
@@ -487,8 +497,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: s_sub_u32 s0, s12, s14
; GFX10-NEXT: s_subb_u32 s1, s13, s15
; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[12:13], s[14:15]
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
@@ -498,11 +508,11 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_subb_u32 s5, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: s_sub_u32 s8, s4, s6
+; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[4:5], s[6:7]
+; GFX11-NEXT: s_subb_u32 s9, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s8
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -538,11 +548,11 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; SI-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -561,59 +571,64 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
-; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
+; VI-NEXT: v_subb_u32_e32 v9, vcc, v1, v3, vcc
+; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT: flat_store_byte v[6:7], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_usubo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX9-NEXT: global_store_byte v6, v0, s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_usubo_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_mov_b32_e32 v6, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
-; GFX10-NEXT: global_store_byte v4, v2, s[10:11]
+; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX10-NEXT: global_store_byte v6, v0, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_usubo_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
-; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v4, v2, s[2:3]
+; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1]
+; GFX11-NEXT: global_store_b8 v6, v0, s[2:3]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -675,11 +690,11 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_sub_u32_e32 v5, vcc, v4, v5
+; VI-NEXT: v_sub_u32_e32 v6, vcc, v4, v5
+; VI-NEXT: v_and_b32_e32 v5, 0xffff, v5
; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; VI-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; VI-NEXT: v_cmp_gt_u32_e32 vcc, v6, v4
-; VI-NEXT: flat_store_short v[0:1], v5
+; VI-NEXT: v_cmp_gt_u32_e32 vcc, v4, v5
+; VI-NEXT: flat_store_short v[0:1], v6
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
@@ -692,10 +707,10 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_load_ushort v1, v0, s[12:13]
; GFX9-NEXT: global_load_ushort v2, v0, s[14:15]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v2, v1, v2
-; GFX9-NEXT: v_cmp_gt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT: v_sub_u32_e32 v3, v1, v2
+; GFX9-NEXT: v_cmp_gt_u32_sdwa s[0:1], v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT: global_store_short v0, v2, s[8:9]
+; GFX9-NEXT: global_store_short v0, v3, s[8:9]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
;
@@ -708,10 +723,10 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, v1, v2
-; GFX10-NEXT: v_cmp_gt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-NEXT: v_cmp_gt_u32_sdwa s0, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, v1, v2
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT: global_store_short v0, v2, s[8:9]
+; GFX10-NEXT: global_store_short v0, v3, s[8:9]
; GFX10-NEXT: global_store_byte v0, v1, s[10:11]
; GFX10-NEXT: s_endpgm
;
@@ -721,18 +736,19 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5]
-; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: global_load_d16_b16 v1, v0, s[6:7]
+; GFX11-NEXT: global_load_d16_b16 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sub_nc_u32_e32 v2, v1, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, v3, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v2, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
-; GFX11-NEXT: global_store_b8 v0, v1, s[2:3]
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 3ddb2f02c48fe..8bd4073e35c74 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -730,38 +730,52 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_usubsat_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
+; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2
+; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_usubsat_i64:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_usubsat_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_usubsat_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
}
diff --git a/llvm/test/CodeGen/ARM/addsubo-legalization.ll b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
index 5ebb115791c66..e5789de4ca415 100644
--- a/llvm/test/CodeGen/ARM/addsubo-legalization.ll
+++ b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
@@ -49,24 +49,24 @@ define <2 x i1> @usubo(ptr %ptr, ptr %ptr2) {
; CHECK-LABEL: usubo:
; CHECK: @ %bb.0:
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
-; CHECK-NEXT: vsub.i64 q8, q9, q8
-; CHECK-NEXT: vmov lr, r12, d18
-; CHECK-NEXT: vmov r4, r5, d19
-; CHECK-NEXT: vmov r3, r2, d16
-; CHECK-NEXT: vmov r6, r7, d17
-; CHECK-NEXT: subs.w r3, lr, r3
-; CHECK-NEXT: sbcs.w r2, r12, r2
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vmov r3, r2, d18
+; CHECK-NEXT: vmov r6, r7, d19
+; CHECK-NEXT: vmov lr, r12, d16
+; CHECK-NEXT: vmov r4, r5, d17
+; CHECK-NEXT: vsub.i64 q8, q8, q9
+; CHECK-NEXT: subs.w r3, r3, lr
+; CHECK-NEXT: sbcs.w r2, r2, r12
; CHECK-NEXT: mov.w r2, #0
; CHECK-NEXT: it lo
; CHECK-NEXT: movlo r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r2, #-1
-; CHECK-NEXT: subs r3, r4, r6
-; CHECK-NEXT: sbcs.w r3, r5, r7
+; CHECK-NEXT: subs r3, r6, r4
+; CHECK-NEXT: sbcs.w r3, r7, r5
; CHECK-NEXT: it lo
; CHECK-NEXT: movlo r1, #1
; CHECK-NEXT: cmp r1, #0
diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-cond-sub-clamp.ll
index ab09cc9ed50a0..81628a15915a2 100644
--- a/llvm/test/CodeGen/LoongArch/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/LoongArch/atomicrmw-cond-sub-clamp.ll
@@ -203,9 +203,9 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; LA64-NEXT: move $a5, $a4
; LA64-NEXT: srl.w $a4, $a4, $a2
; LA64-NEXT: andi $a4, $a4, 255
-; LA64-NEXT: sub.d $a6, $a4, $a1
-; LA64-NEXT: sltu $a4, $a4, $a6
-; LA64-NEXT: masknez $a4, $a6, $a4
+; LA64-NEXT: sltu $a6, $a1, $a4
+; LA64-NEXT: sub.d $a4, $a4, $a1
+; LA64-NEXT: masknez $a4, $a4, $a6
; LA64-NEXT: sll.w $a4, $a4, $a2
; LA64-NEXT: and $a6, $a5, $a3
; LA64-NEXT: or $a6, $a6, $a4
@@ -252,9 +252,9 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; LA64-NEXT: move $a5, $a4
; LA64-NEXT: srl.w $a4, $a4, $a2
; LA64-NEXT: bstrpick.d $a4, $a4, 15, 0
-; LA64-NEXT: sub.d $a6, $a4, $a1
-; LA64-NEXT: sltu $a4, $a4, $a6
-; LA64-NEXT: masknez $a4, $a6, $a4
+; LA64-NEXT: sltu $a6, $a1, $a4
+; LA64-NEXT: sub.d $a4, $a4, $a1
+; LA64-NEXT: masknez $a4, $a4, $a6
; LA64-NEXT: sll.w $a4, $a4, $a2
; LA64-NEXT: and $a6, $a5, $a3
; LA64-NEXT: or $a6, $a6, $a4
@@ -292,9 +292,9 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
; LA64-NEXT: # =>This Loop Header: Depth=1
; LA64-NEXT: # Child Loop BB6_3 Depth 2
; LA64-NEXT: move $a3, $a2
-; LA64-NEXT: sub.d $a2, $a2, $a1
-; LA64-NEXT: sltu $a4, $a3, $a2
-; LA64-NEXT: masknez $a4, $a2, $a4
+; LA64-NEXT: sltu $a2, $a1, $a2
+; LA64-NEXT: sub.d $a4, $a3, $a1
+; LA64-NEXT: masknez $a4, $a4, $a2
; LA64-NEXT: .LBB6_3: # %atomicrmw.start
; LA64-NEXT: # Parent Loop BB6_1 Depth=1
; LA64-NEXT: # => This Inner Loop Header: Depth=2
@@ -328,9 +328,9 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
; LA64-NEXT: # =>This Loop Header: Depth=1
; LA64-NEXT: # Child Loop BB7_3 Depth 2
; LA64-NEXT: move $a3, $a2
-; LA64-NEXT: sub.d $a2, $a2, $a1
-; LA64-NEXT: sltu $a4, $a3, $a2
-; LA64-NEXT: masknez $a4, $a2, $a4
+; LA64-NEXT: sltu $a2, $a1, $a2
+; LA64-NEXT: sub.d $a4, $a3, $a1
+; LA64-NEXT: masknez $a4, $a4, $a2
; LA64-NEXT: .LBB7_3: # %atomicrmw.start
; LA64-NEXT: # Parent Loop BB7_1 Depth=1
; LA64-NEXT: # => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll
index ff176c80ab342..8e22b4eb6bb9d 100644
--- a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll
@@ -218,16 +218,15 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: # Child Loop BB4_4 Depth 2
; CHECK-NEXT: srw 8, 7, 3
; CHECK-NEXT: clrlwi 9, 8, 24
-; CHECK-NEXT: sub 8, 9, 4
-; CHECK-NEXT: cmplw 8, 9
-; CHECK-NEXT: li 9, 0
+; CHECK-NEXT: cmplw 9, 4
+; CHECK-NEXT: li 8, 0
; CHECK-NEXT: bgt 0, .LBB4_3
; CHECK-NEXT: # %bb.2: # %atomicrmw.start
; CHECK-NEXT: #
-; CHECK-NEXT: mr 9, 8
+; CHECK-NEXT: sub 8, 9, 4
; CHECK-NEXT: .LBB4_3: # %atomicrmw.start
; CHECK-NEXT: #
-; CHECK-NEXT: slw 8, 9, 3
+; CHECK-NEXT: slw 8, 8, 3
; CHECK-NEXT: and 9, 7, 6
; CHECK-NEXT: or 9, 9, 8
; CHECK-NEXT: .LBB4_4: # %cmpxchg.start
@@ -277,16 +276,15 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: # Child Loop BB5_4 Depth 2
; CHECK-NEXT: srw 8, 7, 3
; CHECK-NEXT: clrlwi 9, 8, 16
-; CHECK-NEXT: sub 8, 9, 4
-; CHECK-NEXT: cmplw 8, 9
-; CHECK-NEXT: li 9, 0
+; CHECK-NEXT: cmplw 9, 4
+; CHECK-NEXT: li 8, 0
; CHECK-NEXT: bgt 0, .LBB5_3
; CHECK-NEXT: # %bb.2: # %atomicrmw.start
; CHECK-NEXT: #
-; CHECK-NEXT: mr 9, 8
+; CHECK-NEXT: sub 8, 9, 4
; CHECK-NEXT: .LBB5_3: # %atomicrmw.start
; CHECK-NEXT: #
-; CHECK-NEXT: slw 8, 9, 3
+; CHECK-NEXT: slw 8, 8, 3
; CHECK-NEXT: and 9, 7, 6
; CHECK-NEXT: or 9, 9, 8
; CHECK-NEXT: .LBB5_4: # %cmpxchg.start
@@ -325,13 +323,12 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
; CHECK-NEXT: .LBB6_1: # %atomicrmw.start
; CHECK-NEXT: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB6_3 Depth 2
-; CHECK-NEXT: sub 5, 6, 4
-; CHECK-NEXT: cmplw 5, 6
+; CHECK-NEXT: cmplw 6, 4
; CHECK-NEXT: li 7, 0
; CHECK-NEXT: bgt 0, .LBB6_3
; CHECK-NEXT: # %bb.2: # %atomicrmw.start
; CHECK-NEXT: #
-; CHECK-NEXT: mr 7, 5
+; CHECK-NEXT: sub 7, 6, 4
; CHECK-NEXT: .LBB6_3: # %cmpxchg.start
; CHECK-NEXT: # Parent Loop BB6_1 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/RISCV/pr170634.ll b/llvm/test/CodeGen/RISCV/pr170634.ll
new file mode 100644
index 0000000000000..52f011e167e0f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr170634.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv64 -O1 < %s | FileCheck %s
+
+declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64)
+
+; Test that usub.with.overflow generates efficient code without an extra mv instruction
+define i64 @test_usubo_no_extra_mv(i64 %x, i64 %y) {
+; CHECK-LABEL: test_usubo_no_extra_mv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bltu a1, a0, .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: li a0, 291
+; CHECK-NEXT: ret
+ %res = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %x, i64 %y)
+ %val = extractvalue { i64, i1 } %res, 0
+ %ovf = extractvalue { i64, i1 } %res, 1
+ %ret = select i1 %ovf, i64 291, i64 %val
+ ret i64 %ret
+}
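
The new RISC-V test above exercises the usual select-on-overflow idiom. As a minimal, hand-written sketch (not part of the patch, and with a hypothetical function name), the IR below spells out what that idiom is equivalent to under the standard semantics of llvm.usub.with.overflow, whose overflow bit is set exactly when the first operand is unsigned-less-than the second; the constant 291 is reused from the test above:

; Hypothetical equivalent of test_usubo_no_extra_mv with the intrinsic
; expanded into a plain subtraction plus an unsigned compare of the
; original operands.
define i64 @usubo_expanded_sketch(i64 %x, i64 %y) {
  %diff = sub i64 %x, %y        ; wrapping difference, defined even on overflow
  %ovf = icmp ult i64 %x, %y    ; unsigned underflow occurs iff x u< y
  %ret = select i1 %ovf, i64 291, i64 %diff
  ret i64 %ret
}
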
diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
index bbc0ff9bd1be5..1e537fe64c08d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
@@ -265,28 +265,28 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @usub_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
; CHECK-LABEL: usub_int64_t:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vmov r0, r1, d3
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: vmov r0, r8, d3
; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: subs r5, r2, r0
-; CHECK-NEXT: sbc.w lr, r3, r1
-; CHECK-NEXT: subs r2, r2, r5
-; CHECK-NEXT: sbcs.w r2, r3, lr
-; CHECK-NEXT: vmov r3, r12, d2
-; CHECK-NEXT: vmov r1, r4, d0
-; CHECK-NEXT: csetm r2, lo
-; CHECK-NEXT: subs r3, r1, r3
-; CHECK-NEXT: sbc.w r0, r4, r12
-; CHECK-NEXT: subs r1, r1, r3
-; CHECK-NEXT: sbcs.w r1, r4, r0
-; CHECK-NEXT: vmov q1[2], q1[0], r3, r5
+; CHECK-NEXT: vmov r4, r5, d2
+; CHECK-NEXT: vmov r6, r7, d0
+; CHECK-NEXT: subs.w r12, r2, r0
+; CHECK-NEXT: sbc.w lr, r3, r8
+; CHECK-NEXT: subs r1, r6, r4
+; CHECK-NEXT: vmov q0[2], q0[0], r1, r12
+; CHECK-NEXT: sbc.w r1, r7, r5
+; CHECK-NEXT: subs r0, r0, r2
+; CHECK-NEXT: vmov q0[3], q0[1], r1, lr
+; CHECK-NEXT: sbcs.w r0, r8, r3
+; CHECK-NEXT: csetm r0, lo
+; CHECK-NEXT: subs r1, r4, r6
+; CHECK-NEXT: sbcs.w r1, r5, r7
; CHECK-NEXT: csetm r1, lo
-; CHECK-NEXT: vmov q1[3], q1[1], r0, lr
-; CHECK-NEXT: vmov q0[2], q0[0], r1, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r2
-; CHECK-NEXT: vbic q0, q1, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
+; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT: vbic q0, q0, q1
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%0 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)
ret <2 x i64> %0
diff --git a/llvm/test/CodeGen/VE/Scalar/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/VE/Scalar/atomicrmw-cond-sub-clamp.ll
index 860c4004658db..ad9e8a69cc81f 100644
--- a/llvm/test/CodeGen/VE/Scalar/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/VE/Scalar/atomicrmw-cond-sub-clamp.ll
@@ -138,10 +138,10 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: and %s4, %s5, (32)0
; CHECK-NEXT: srl %s4, %s4, %s0
; CHECK-NEXT: and %s4, %s4, (56)0
-; CHECK-NEXT: subs.w.sx %s6, %s4, %s3
-; CHECK-NEXT: cmpu.w %s4, %s6, %s4
-; CHECK-NEXT: cmov.w.gt %s6, (0)1, %s4
-; CHECK-NEXT: sla.w.sx %s4, %s6, %s0
+; CHECK-NEXT: cmpu.w %s6, %s4, %s3
+; CHECK-NEXT: subs.w.sx %s4, %s4, %s3
+; CHECK-NEXT: cmov.w.gt %s4, (0)1, %s6
+; CHECK-NEXT: sla.w.sx %s4, %s4, %s0
; CHECK-NEXT: and %s6, %s5, %s2
; CHECK-NEXT: or %s4, %s6, %s4
; CHECK-NEXT: cas.w %s4, (%s1), %s5
@@ -174,10 +174,10 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: and %s4, %s5, (32)0
; CHECK-NEXT: srl %s4, %s4, %s0
; CHECK-NEXT: and %s4, %s4, (48)0
-; CHECK-NEXT: subs.w.sx %s6, %s4, %s3
-; CHECK-NEXT: cmpu.w %s4, %s6, %s4
-; CHECK-NEXT: cmov.w.gt %s6, (0)1, %s4
-; CHECK-NEXT: sla.w.sx %s4, %s6, %s0
+; CHECK-NEXT: cmpu.w %s6, %s4, %s3
+; CHECK-NEXT: subs.w.sx %s4, %s4, %s3
+; CHECK-NEXT: cmov.w.gt %s4, (0)1, %s6
+; CHECK-NEXT: sla.w.sx %s4, %s4, %s0
; CHECK-NEXT: and %s6, %s5, %s2
; CHECK-NEXT: or %s4, %s6, %s4
; CHECK-NEXT: cas.w %s4, (%s1), %s5
@@ -200,8 +200,8 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
; CHECK-NEXT: .LBB6_1: # %atomicrmw.start
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: or %s3, 0, %s2
+; CHECK-NEXT: cmpu.w %s4, %s2, %s1
; CHECK-NEXT: subs.w.sx %s2, %s2, %s1
-; CHECK-NEXT: cmpu.w %s4, %s2, %s3
; CHECK-NEXT: cmov.w.gt %s2, (0)1, %s4
; CHECK-NEXT: cas.w %s2, (%s0), %s3
; CHECK-NEXT: brne.w %s2, %s3, .LBB6_1
@@ -222,7 +222,7 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: or %s3, 0, %s2
; CHECK-NEXT: subs.l %s2, %s2, %s1
-; CHECK-NEXT: cmpu.l %s4, %s2, %s3
+; CHECK-NEXT: cmpu.l %s4, %s3, %s1
; CHECK-NEXT: cmov.l.gt %s2, (0)1, %s4
; CHECK-NEXT: cas.l %s2, (%s0), %s3
; CHECK-NEXT: brne.l %s2, %s3, .LBB7_1
diff --git a/llvm/test/CodeGen/WebAssembly/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/WebAssembly/atomicrmw-cond-sub-clamp.ll
index 3355237425b42..e3c5da02ab16b 100644
--- a/llvm/test/CodeGen/WebAssembly/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/WebAssembly/atomicrmw-cond-sub-clamp.ll
@@ -189,12 +189,11 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; WASM32-NEXT: i32.load8_u 0
; WASM32-NEXT: local.tee 2
; WASM32-NEXT: local.get 1
-; WASM32-NEXT: i32.const 255
-; WASM32-NEXT: i32.and
; WASM32-NEXT: i32.sub
-; WASM32-NEXT: local.tee 1
-; WASM32-NEXT: local.get 1
; WASM32-NEXT: local.get 2
+; WASM32-NEXT: local.get 1
+; WASM32-NEXT: i32.const 255
+; WASM32-NEXT: i32.and
; WASM32-NEXT: i32.gt_u
; WASM32-NEXT: i32.select
; WASM32-NEXT: i32.store8 0
@@ -211,12 +210,11 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
; WASM64-NEXT: i32.load8_u 0
; WASM64-NEXT: local.tee 2
; WASM64-NEXT: local.get 1
-; WASM64-NEXT: i32.const 255
-; WASM64-NEXT: i32.and
; WASM64-NEXT: i32.sub
-; WASM64-NEXT: local.tee 1
-; WASM64-NEXT: local.get 1
; WASM64-NEXT: local.get 2
+; WASM64-NEXT: local.get 1
+; WASM64-NEXT: i32.const 255
+; WASM64-NEXT: i32.and
; WASM64-NEXT: i32.gt_u
; WASM64-NEXT: i32.select
; WASM64-NEXT: i32.store8 0
@@ -237,12 +235,11 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; WASM32-NEXT: i32.load16_u 0
; WASM32-NEXT: local.tee 2
; WASM32-NEXT: local.get 1
-; WASM32-NEXT: i32.const 65535
-; WASM32-NEXT: i32.and
; WASM32-NEXT: i32.sub
-; WASM32-NEXT: local.tee 1
-; WASM32-NEXT: local.get 1
; WASM32-NEXT: local.get 2
+; WASM32-NEXT: local.get 1
+; WASM32-NEXT: i32.const 65535
+; WASM32-NEXT: i32.and
; WASM32-NEXT: i32.gt_u
; WASM32-NEXT: i32.select
; WASM32-NEXT: i32.store16 0
@@ -259,12 +256,11 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
; WASM64-NEXT: i32.load16_u 0
; WASM64-NEXT: local.tee 2
; WASM64-NEXT: local.get 1
-; WASM64-NEXT: i32.const 65535
-; WASM64-NEXT: i32.and
; WASM64-NEXT: i32.sub
-; WASM64-NEXT: local.tee 1
-; WASM64-NEXT: local.get 1
; WASM64-NEXT: local.get 2
+; WASM64-NEXT: local.get 1
+; WASM64-NEXT: i32.const 65535
+; WASM64-NEXT: i32.and
; WASM64-NEXT: i32.gt_u
; WASM64-NEXT: i32.select
; WASM64-NEXT: i32.store16 0
@@ -286,9 +282,8 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
; WASM32-NEXT: local.tee 2
; WASM32-NEXT: local.get 1
; WASM32-NEXT: i32.sub
-; WASM32-NEXT: local.tee 1
-; WASM32-NEXT: local.get 1
; WASM32-NEXT: local.get 2
+; WASM32-NEXT: local.get 1
; WASM32-NEXT: i32.gt_u
; WASM32-NEXT: i32.select
; WASM32-NEXT: i32.store 0
@@ -306,9 +301,8 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
; WASM64-NEXT: local.tee 2
; WASM64-NEXT: local.get 1
; WASM64-NEXT: i32.sub
-; WASM64-NEXT: local.tee 1
-; WASM64-NEXT: local.get 1
; WASM64-NEXT: local.get 2
+; WASM64-NEXT: local.get 1
; WASM64-NEXT: i32.gt_u
; WASM64-NEXT: i32.select
; WASM64-NEXT: i32.store 0
@@ -330,9 +324,8 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
; WASM32-NEXT: local.tee 2
; WASM32-NEXT: local.get 1
; WASM32-NEXT: i64.sub
-; WASM32-NEXT: local.tee 1
-; WASM32-NEXT: local.get 1
; WASM32-NEXT: local.get 2
+; WASM32-NEXT: local.get 1
; WASM32-NEXT: i64.gt_u
; WASM32-NEXT: i64.select
; WASM32-NEXT: i64.store 0
@@ -350,9 +343,8 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
; WASM64-NEXT: local.tee 2
; WASM64-NEXT: local.get 1
; WASM64-NEXT: i64.sub
-; WASM64-NEXT: local.tee 1
-; WASM64-NEXT: local.get 1
; WASM64-NEXT: local.get 2
+; WASM64-NEXT: local.get 1
; WASM64-NEXT: i64.gt_u
; WASM64-NEXT: i64.select
; WASM64-NEXT: i64.store 0
diff --git a/llvm/test/CodeGen/X86/combine-addo.ll b/llvm/test/CodeGen/X86/combine-addo.ll
index ba748b6e653cf..878dee6b2921b 100644
--- a/llvm/test/CodeGen/X86/combine-addo.ll
+++ b/llvm/test/CodeGen/X86/combine-addo.ll
@@ -75,23 +75,12 @@ define i32 @combine_uadd_not(i32 %a0, i32 %a1) {
define <4 x i32> @combine_vec_uadd_not(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_uadd_not:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: psubd %xmm0, %xmm2
-; SSE-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,1,1,1]
-; SSE-NEXT: pmaxud %xmm2, %xmm0
-; SSE-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_uadd_not:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX-NEXT: vpmaxud %xmm2, %xmm0, %xmm2
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = xor <4 x i32> %a0, <i32 -1, i32 -1, i32 -1, i32 -1>
%2 = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
diff --git a/llvm/test/CodeGen/X86/combine-subo.ll b/llvm/test/CodeGen/X86/combine-subo.ll
index 5e4bba6e0fd35..f336b08318373 100644
--- a/llvm/test/CodeGen/X86/combine-subo.ll
+++ b/llvm/test/CodeGen/X86/combine-subo.ll
@@ -202,13 +202,13 @@ define { <4 x i8>, <4 x i1> } @always_usub_const_vector() nounwind {
; SSE-LABEL: always_usub_const_vector:
; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: always_usub_const_vector:
; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: retq
%x = call { <4 x i8>, <4 x i1> } @llvm.usub.with.overflow.v4i8(<4 x i8> <i8 0, i8 0, i8 0, i8 0>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>)
ret { <4 x i8>, <4 x i1> } %x
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index ceb1ad13bc153..4436732f8d86c 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -43,11 +43,11 @@ define <2 x i32> @usubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -55,43 +55,45 @@ define <2 x i32> @usubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
; SSSE3-LABEL: usubo_v2i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: psubd %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: psubd %xmm1, %xmm0
; SSSE3-NEXT: movq %xmm0, (%rdi)
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movq %xmm2, (%rdi)
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pminud %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm2
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: usubo_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpminud %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm1, (%rdi)
+; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2
+; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovq %xmm0, (%rdi)
+; AVX-NEXT: vmovdqa %xmm2, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: usubo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpcmpnleud %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: vmovq %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
@@ -105,11 +107,11 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v3i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
@@ -119,11 +121,11 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; SSSE3-LABEL: usubo_v3i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: psubd %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: psubd %xmm1, %xmm0
; SSSE3-NEXT: movq %xmm0, (%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSSE3-NEXT: movd %xmm0, 8(%rdi)
@@ -132,35 +134,37 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
;
; SSE41-LABEL: usubo_v3i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pextrd $2, %xmm2, 8(%rdi)
-; SSE41-NEXT: movq %xmm2, (%rdi)
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pminud %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm2
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi)
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: usubo_v3i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpminud %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi)
-; AVX-NEXT: vmovq %xmm1, (%rdi)
+; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2
+; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
+; AVX-NEXT: vmovq %xmm0, (%rdi)
+; AVX-NEXT: vmovdqa %xmm2, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: usubo_v3i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpcmpnleud %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi)
-; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: vpextrd $2, %xmm2, 8(%rdi)
+; AVX512-NEXT: vmovq %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
%val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
@@ -174,11 +178,11 @@ define <4 x i32> @usubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -186,43 +190,45 @@ define <4 x i32> @usubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
; SSSE3-LABEL: usubo_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: psubd %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: psubd %xmm1, %xmm0
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm2, (%rdi)
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pminud %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm2
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: usubo_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpminud %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2
+; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX-NEXT: vmovdqa %xmm2, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: usubo_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpcmpnleud %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
%val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
@@ -236,13 +242,6 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v6i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE2-NEXT: movd %r8d, %xmm0
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -253,37 +252,37 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE2-NEXT: movd %r9d, %xmm1
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psubd %xmm3, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm4, (%rcx)
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psubd %xmm2, %xmm0
-; SSE2-NEXT: movq %xmm0, 16(%rcx)
-; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: movq %xmm0, 16(%rdi)
-; SSE2-NEXT: movdqa %xmm4, (%rdi)
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psubd %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-NEXT: psubd %xmm4, %xmm1
+; SSE2-NEXT: movq %xmm1, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm0, (%rcx)
+; SSE2-NEXT: movq %xmm2, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm5, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v6i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %rdi, %rax
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-NEXT: movd %r8d, %xmm0
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -294,25 +293,32 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSSE3-NEXT: movd %r9d, %xmm1
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: psubd %xmm3, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm4, (%rcx)
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: psubd %xmm2, %xmm0
-; SSSE3-NEXT: movq %xmm0, 16(%rcx)
-; SSSE3-NEXT: pxor %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: movq %xmm0, 16(%rdi)
-; SSSE3-NEXT: movdqa %xmm4, (%rdi)
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSSE3-NEXT: psubd %xmm3, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: psubd %xmm4, %xmm1
+; SSSE3-NEXT: movq %xmm1, 16(%rcx)
+; SSSE3-NEXT: movdqa %xmm0, (%rcx)
+; SSSE3-NEXT: movq %xmm2, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm5, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v6i32:
@@ -332,61 +338,63 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: psubd %xmm3, %xmm4
-; SSE41-NEXT: pminud %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: psubd %xmm1, %xmm5
-; SSE41-NEXT: pminud %xmm5, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm2
-; SSE41-NEXT: movq %xmm5, 16(%rcx)
-; SSE41-NEXT: movdqa %xmm4, (%rcx)
-; SSE41-NEXT: movq %xmm2, 16(%rdi)
-; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: pminud %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT: pxor %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: pminud %xmm1, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: psubd %xmm3, %xmm0
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: movq %xmm2, 16(%rcx)
+; SSE41-NEXT: movdqa %xmm0, (%rcx)
+; SSE41-NEXT: movq %xmm6, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm4, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubo_v6i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
-; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpminud %xmm3, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm0, %xmm6
+; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm1
+; AVX1-NEXT: vmovq %xmm1, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v6i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpminud %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
-; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovq %xmm1, 16(%rdi)
+; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX2-NEXT: vmovdqa %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v6i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT: vpcmpnleud %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpcmpnleud %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, 16(%rdi)
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512-NEXT: vmovq %xmm1, 16(%rdi)
+; AVX512-NEXT: vmovdqa %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
%val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
@@ -401,17 +409,18 @@ define <8 x i32> @usubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: psubd %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, (%rdi)
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: psubd %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: psubd %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: retq
;
@@ -419,72 +428,77 @@ define <8 x i32> @usubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
; SSSE3-NEXT: psubd %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, (%rdi)
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: psubd %xmm3, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: pxor %xmm1, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT: psubd %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm5, %xmm0
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: psubd %xmm2, %xmm4
-; SSE41-NEXT: pminud %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pminud %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
+; SSE41-NEXT: pxor %xmm6, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: psubd %xmm3, %xmm5
-; SSE41-NEXT: pminud %xmm5, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm5, 16(%rdi)
-; SSE41-NEXT: movdqa %xmm4, (%rdi)
+; SSE41-NEXT: pminud %xmm3, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm5
+; SSE41-NEXT: pxor %xmm6, %xmm5
+; SSE41-NEXT: psubd %xmm2, %xmm0
+; SSE41-NEXT: psubd %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubo_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi)
-; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpminud %xmm3, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm0, %xmm6
+; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpminud %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
+; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, (%rdi)
+; AVX2-NEXT: vmovdqa %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v8i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT: vpcmpnleud %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpcmpnleud %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT: vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT: vmovdqa %ymm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
%val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
@@ -497,158 +511,167 @@ define <8 x i32> @usubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v16i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm3, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm8
; SSE2-NEXT: psubd %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm0, (%rdi)
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: psubd %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psubd %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm6
+; SSE2-NEXT: pxor %xmm3, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm7, %xmm6
+; SSE2-NEXT: pxor %xmm3, %xmm6
+; SSE2-NEXT: pxor %xmm9, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm3
+; SSE2-NEXT: psubd %xmm7, %xmm9
+; SSE2-NEXT: movdqa %xmm9, 48(%rdi)
; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: psubd %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm3, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm8
-; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
-; SSE2-NEXT: movdqa %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm5, %xmm2
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v16i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm9
-; SSSE3-NEXT: pxor %xmm8, %xmm9
+; SSSE3-NEXT: movdqa %xmm3, %xmm9
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm0, %xmm8
; SSSE3-NEXT: psubd %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, (%rdi)
-; SSSE3-NEXT: pxor %xmm8, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm3, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8
; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
; SSSE3-NEXT: psubd %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
-; SSSE3-NEXT: pxor %xmm8, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: pxor %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: psubd %xmm6, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm6
+; SSSE3-NEXT: pxor %xmm3, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pxor %xmm3, %xmm6
+; SSSE3-NEXT: pxor %xmm9, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3
+; SSSE3-NEXT: psubd %xmm7, %xmm9
+; SSSE3-NEXT: movdqa %xmm9, 48(%rdi)
; SSSE3-NEXT: movdqa %xmm2, 32(%rdi)
-; SSSE3-NEXT: pxor %xmm8, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: psubd %xmm7, %xmm3
-; SSSE3-NEXT: pxor %xmm3, %xmm8
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8
-; SSSE3-NEXT: movdqa %xmm3, 48(%rdi)
-; SSSE3-NEXT: movdqa %xmm8, %xmm3
+; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm8, %xmm0
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm5, %xmm2
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v16i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: psubd %xmm4, %xmm8
-; SSE41-NEXT: pminud %xmm8, %xmm0
+; SSE41-NEXT: pminud %xmm4, %xmm0
; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm12, %xmm12
+; SSE41-NEXT: pxor %xmm12, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm9
-; SSE41-NEXT: psubd %xmm5, %xmm9
-; SSE41-NEXT: pminud %xmm9, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: psubd %xmm6, %xmm5
-; SSE41-NEXT: pminud %xmm5, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: psubd %xmm7, %xmm6
-; SSE41-NEXT: pminud %xmm6, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm6, 48(%rdi)
-; SSE41-NEXT: movdqa %xmm5, 32(%rdi)
-; SSE41-NEXT: movdqa %xmm9, 16(%rdi)
+; SSE41-NEXT: pminud %xmm5, %xmm9
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm9
+; SSE41-NEXT: pxor %xmm12, %xmm9
+; SSE41-NEXT: movdqa %xmm2, %xmm10
+; SSE41-NEXT: pminud %xmm6, %xmm10
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm10
+; SSE41-NEXT: pxor %xmm12, %xmm10
+; SSE41-NEXT: movdqa %xmm3, %xmm11
+; SSE41-NEXT: pminud %xmm7, %xmm11
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm11
+; SSE41-NEXT: pxor %xmm12, %xmm11
+; SSE41-NEXT: psubd %xmm4, %xmm8
+; SSE41-NEXT: psubd %xmm5, %xmm1
+; SSE41-NEXT: psubd %xmm6, %xmm2
+; SSE41-NEXT: psubd %xmm7, %xmm3
+; SSE41-NEXT: movdqa %xmm3, 48(%rdi)
+; SSE41-NEXT: movdqa %xmm2, 32(%rdi)
+; SSE41-NEXT: movdqa %xmm1, 16(%rdi)
; SSE41-NEXT: movdqa %xmm8, (%rdi)
+; SSE41-NEXT: movdqa %xmm9, %xmm1
+; SSE41-NEXT: movdqa %xmm10, %xmm2
+; SSE41-NEXT: movdqa %xmm11, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubo_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5
-; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpsubd %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpminud %xmm6, %xmm5, %xmm6
+; AVX1-NEXT: vpminud %xmm4, %xmm5, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm6
+; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm7, %xmm1, %xmm7
+; AVX1-NEXT: vpackssdw %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
+; AVX1-NEXT: vpminud %xmm7, %xmm8, %xmm9
+; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm9
+; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm10
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm10, %xmm10
+; AVX1-NEXT: vpackssdw %xmm9, %xmm10, %xmm9
+; AVX1-NEXT: vpacksswb %xmm6, %xmm9, %xmm9
+; AVX1-NEXT: vpcmpeqd %xmm10, %xmm10, %xmm10
+; AVX1-NEXT: vpxor %xmm10, %xmm9, %xmm9
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpackssdw %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6
+; AVX1-NEXT: vpsubd %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpmovsxbd %xmm9, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpacksswb %xmm6, %xmm6, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm10, %xmm1
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
-; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm7, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v16i32:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm4
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4
+; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
+; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm6
; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpminud %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpcmpeqd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpminud %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
-; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT: vmovdqa %ymm6, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v16i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1
-; AVX512-NEXT: vpcmpnleud %zmm0, %zmm1, %k1
+; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi)
+; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
%val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
@@ -663,10 +686,10 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubb %xmm1, %xmm4
-; SSE2-NEXT: pminub %xmm4, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT: pminub %xmm0, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -690,10 +713,10 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: psubb %xmm1, %xmm4
-; SSSE3-NEXT: pminub %xmm4, %xmm0
-; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0
+; SSSE3-NEXT: pminub %xmm0, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm0, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT: pxor %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -717,10 +740,10 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psubb %xmm1, %xmm4
-; SSE41-NEXT: pminub %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE41-NEXT: pminub %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm3
; SSE41-NEXT: pmovsxbd %xmm3, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -740,8 +763,8 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; AVX1-LABEL: usubo_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpminub %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0
@@ -759,8 +782,8 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; AVX2-LABEL: usubo_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpminub %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0
@@ -771,10 +794,10 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: usubo_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleub %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpcmpnleub %xmm1, %xmm0, %k1
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
%val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
@@ -788,11 +811,11 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; SSE2-LABEL: usubo_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: psubw %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: psubw %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
@@ -806,11 +829,11 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; SSSE3-LABEL: usubo_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: psubw %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT: psubw %xmm1, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
@@ -825,9 +848,9 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubw %xmm1, %xmm2
-; SSE41-NEXT: pminuw %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pminuw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
@@ -838,35 +861,35 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
;
; AVX1-LABEL: usubo_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpminuw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpminuw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v8i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleuw %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpcmpnleuw %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
@@ -880,18 +903,18 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSE-LABEL: usubo_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pxor %xmm2, %xmm3
-; SSE-NEXT: psubq %xmm1, %xmm0
; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: pcmpeqd %xmm3, %xmm4
; SSE-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,3,3]
-; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: psubq %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,3,3,3]
+; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,3,3]
-; SSE-NEXT: por %xmm4, %xmm1
+; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
@@ -900,32 +923,32 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpnleuq %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
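A brief note on reading the updated X86 check lines above: SSE has no unsigned pcmpgtd, so the SSE4.1/AVX paths use the pminud/pcmpeqd sequence followed by pxor with an all-ones register to form an unsigned compare mask, while the SSE2/SSSE3 paths instead bias both operands by 0x80000000 so that the signed pcmpgtd performs the unsigned compare. The snippet below is only a scalar model of one 32-bit lane of the pminud idiom, written here to make those sequences easier to read; it is an illustration with an invented function name, not code taken from the patch.

#include <algorithm>
#include <cassert>
#include <cstdint>

// One 32-bit lane of the pminud / pcmpeqd / pxor pattern:
//   min(a, b) == a  holds exactly when  a u<= b,
//   and XOR with an all-ones mask negates that,
//   yielding a lane mask for  a u> b.
static uint32_t unsigned_gt_mask(uint32_t A, uint32_t B) {
  uint32_t MinAB = std::min(A, B);                   // pminud
  uint32_t LeMask = (MinAB == A) ? 0xFFFFFFFFu : 0u; // pcmpeqd
  return LeMask ^ 0xFFFFFFFFu;                       // pxor with all-ones
}

int main() {
  assert(unsigned_gt_mask(3, 2) == 0xFFFFFFFFu); // 3 u> 2
  assert(unsigned_gt_mask(2, 3) == 0u);          // 2 u<= 3
  assert(unsigned_gt_mask(2, 2) == 0u);          // equal lanes are not greater
  return 0;
}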
diff --git a/llvm/test/Transforms/InstCombine/pr170634.ll b/llvm/test/Transforms/InstCombine/pr170634.ll
deleted file mode 100644
index 3224b8b63afd3..0000000000000
--- a/llvm/test/Transforms/InstCombine/pr170634.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=instcombine -S < %s | FileCheck %s
-define dso_local i64 @func(i64 noundef %x, i64 noundef %y) local_unnamed_addr {
-; CHECK-LABEL: @func(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
-; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
-; CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: br label [[RETURN:%.*]]
-; CHECK: if.end:
-; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0
-; CHECK-NEXT: br label [[RETURN]]
-; CHECK: return:
-; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i64 [ 291, [[IF_THEN]] ], [ [[TMP1]], [[IF_END]] ]
-; CHECK-NEXT: ret i64 [[RETVAL_0]]
-;
-entry:
- %0 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %x, i64 %y)
- %1 = extractvalue { i64, i1 } %0, 1
- %2 = extractvalue { i64, i1 } %0, 0
- br i1 %1, label %if.then, label %if.end
-
-if.then: ; preds = %entry
- br label %return
-
-if.end: ; preds = %entry
- br label %return
-
-return: ; preds = %if.end, %if.then
- %retval.0 = phi i64 [ 291, %if.then ], [ %2, %if.end ]
- ret i64 %retval.0
-}
-
>From 28c821c099a89a3c47d56dc6e881c2c4e99c245e Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Mon, 8 Dec 2025 17:38:10 +0530
Subject: [PATCH 6/8] Apply suggestions from code review
Co-authored-by: Jay Foad <jay.foad at gmail.com>
---
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8b46c4c1e66db..ad33f32160f0c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11467,7 +11467,7 @@ void TargetLowering::expandUADDSUBO(
} else {
ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT;
SDValue CompareLHS = IsAdd ? Result : LHS;
- SDValue CompareRHS = IsAdd ? LHS : RHS;
+ SDValue CompareRHS = RHS;
SetCC = DAG.getSetCC(dl, SetCCType, CompareLHS, CompareRHS, CC);
}
Overflow = DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType);
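For context when reading the hunk above, the scalar semantics of the two unsigned overflow flags can be stated independently of SelectionDAG: per the LangRef, uadd.with.overflow sets its flag when the wrapped sum carries, and usub.with.overflow sets its flag when the subtraction borrows. The following is a minimal standalone C++ sketch of those reference semantics; the function names are invented for the example, and this is not the expandUADDSUBO code itself.

#include <cassert>
#include <cstdint>
#include <utility>

// Reference semantics: unsigned addition carries iff the wrapped sum is
// below either addend; unsigned subtraction borrows iff the minuend is
// smaller than the subtrahend.
static std::pair<uint32_t, bool> uaddo(uint32_t LHS, uint32_t RHS) {
  uint32_t Result = LHS + RHS;    // wrapping add
  return {Result, Result < LHS};  // carry iff Result u< LHS (equivalently Result u< RHS)
}

static std::pair<uint32_t, bool> usubo(uint32_t LHS, uint32_t RHS) {
  uint32_t Result = LHS - RHS;    // wrapping sub
  return {Result, LHS < RHS};     // borrow iff LHS u< RHS
}

int main() {
  assert(uaddo(0xFFFFFFFFu, 1u).second);  // wraps, carry set
  assert(!uaddo(1u, 2u).second);          // no carry
  assert(usubo(1u, 2u).second);           // 1 - 2 borrows
  assert(!usubo(2u, 1u).second);          // no borrow
  return 0;
}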
>From 3de9265a40fbf04c03ad9098b5568157eafdd9fa Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Thu, 11 Dec 2025 16:15:46 +0530
Subject: [PATCH 7/8] Update CHECK-NEXT lines in the affected test cases
---
llvm/test/CodeGen/AArch64/active_lane_mask.ll | 18 +-
llvm/test/CodeGen/AArch64/vec_uaddo.ll | 72 +-
.../test/CodeGen/AMDGPU/carryout-selection.ll | 889 +----
llvm/test/CodeGen/AMDGPU/carryout-selection.s | 3547 +++++++++++++++++
llvm/test/CodeGen/AMDGPU/uaddo.ll | 28 +-
llvm/test/CodeGen/ARM/addsubo-legalization.ll | 14 +-
llvm/test/CodeGen/PowerPC/sat-add.ll | 16 +-
llvm/test/CodeGen/RISCV/addcarry.ll | 22 +-
.../test/CodeGen/RISCV/arith-with-overflow.ll | 6 +-
.../test/CodeGen/RISCV/overflow-intrinsics.ll | 18 +-
llvm/test/CodeGen/RISCV/uadd_sat.ll | 16 +-
llvm/test/CodeGen/RISCV/uadd_sat_plus.ll | 16 +-
.../RISCV/umulo-128-legalisation-lowering.ll | 66 +-
llvm/test/CodeGen/RISCV/xaluo.ll | 243 +-
llvm/test/CodeGen/RISCV/xqcia.ll | 8 +-
.../SPARC/umulo-128-legalisation-lowering.ll | 164 +-
.../CodeGen/Thumb2/mve-saturating-arith.ll | 36 +-
.../umulo-128-legalisation-lowering.ll | 147 +-
.../CodeGen/X86/expand-vp-int-intrinsics.ll | 7 +-
llvm/test/CodeGen/X86/sat-add.ll | 29 +-
llvm/test/CodeGen/X86/uadd_sat_vec.ll | 271 +-
llvm/test/CodeGen/X86/vec_uaddo.ll | 834 ++--
22 files changed, 4661 insertions(+), 1806 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/carryout-selection.s
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index 879dd4c12c0ba..b7a40a9f20519 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -304,12 +304,13 @@ define <16 x i1> @lane_mask_v16i1_i8(i8 %index, i8 %TC) {
;
; CHECK-STREAMING-LABEL: lane_mask_v16i1_i8:
; CHECK-STREAMING: // %bb.0:
-; CHECK-STREAMING-NEXT: index z0.b, w0, #1
+; CHECK-STREAMING-NEXT: index z0.b, #0, #1
; CHECK-STREAMING-NEXT: mov z1.b, w0
; CHECK-STREAMING-NEXT: ptrue p0.b, vl16
-; CHECK-STREAMING-NEXT: cmphi p1.b, p0/z, z1.b, z0.b
-; CHECK-STREAMING-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-STREAMING-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-STREAMING-NEXT: add z1.b, z1.b, z0.b
+; CHECK-STREAMING-NEXT: cmphi p1.b, p0/z, z0.b, z1.b
+; CHECK-STREAMING-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-STREAMING-NEXT: orr z0.d, z1.d, z0.d
; CHECK-STREAMING-NEXT: mov z1.b, w1
; CHECK-STREAMING-NEXT: cmphi p0.b, p0/z, z1.b, z0.b
; CHECK-STREAMING-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
@@ -331,12 +332,13 @@ define <8 x i1> @lane_mask_v8i1_i8(i8 %index, i8 %TC) {
;
; CHECK-STREAMING-LABEL: lane_mask_v8i1_i8:
; CHECK-STREAMING: // %bb.0:
-; CHECK-STREAMING-NEXT: index z0.b, w0, #1
+; CHECK-STREAMING-NEXT: index z0.b, #0, #1
; CHECK-STREAMING-NEXT: mov z1.b, w0
; CHECK-STREAMING-NEXT: ptrue p0.b, vl8
-; CHECK-STREAMING-NEXT: cmphi p1.b, p0/z, z1.b, z0.b
-; CHECK-STREAMING-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-STREAMING-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-STREAMING-NEXT: add z1.b, z1.b, z0.b
+; CHECK-STREAMING-NEXT: cmphi p1.b, p0/z, z0.b, z1.b
+; CHECK-STREAMING-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-STREAMING-NEXT: orr z0.d, z1.d, z0.d
; CHECK-STREAMING-NEXT: mov z1.b, w1
; CHECK-STREAMING-NEXT: cmphi p0.b, p0/z, z1.b, z0.b
; CHECK-STREAMING-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
index 2f51208e49351..e4891496e337c 100644
--- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -19,9 +19,9 @@ declare {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128>, <2 x
define <1 x i32> @uaddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind {
; CHECK-LABEL: uaddo_v1i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: add v1.2s, v0.2s, v1.2s
-; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: str s1, [x0]
+; CHECK-NEXT: add v2.2s, v0.2s, v1.2s
+; CHECK-NEXT: cmhi v0.2s, v1.2s, v2.2s
+; CHECK-NEXT: str s2, [x0]
; CHECK-NEXT: ret
%t = call {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
%val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
@@ -34,9 +34,9 @@ define <1 x i32> @uaddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind {
define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
; CHECK-LABEL: uaddo_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: add v1.2s, v0.2s, v1.2s
-; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: str d1, [x0]
+; CHECK-NEXT: add v2.2s, v0.2s, v1.2s
+; CHECK-NEXT: cmhi v0.2s, v1.2s, v2.2s
+; CHECK-NEXT: str d2, [x0]
; CHECK-NEXT: ret
%t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
@@ -49,11 +49,11 @@ define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; CHECK-LABEL: uaddo_v3i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: add v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: mov s2, v1.s[2]
-; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: str d1, [x0]
-; CHECK-NEXT: str s2, [x0, #8]
+; CHECK-NEXT: add v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmhi v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: mov s1, v2.s[2]
+; CHECK-NEXT: str d2, [x0]
+; CHECK-NEXT: str s1, [x0, #8]
; CHECK-NEXT: ret
%t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
%val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
@@ -66,9 +66,9 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
define <4 x i32> @uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
; CHECK-LABEL: uaddo_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: add v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: str q1, [x0]
+; CHECK-NEXT: add v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmhi v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: str q2, [x0]
; CHECK-NEXT: ret
%t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
%val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
@@ -94,21 +94,21 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; CHECK-NEXT: mov v0.s[2], w2
; CHECK-NEXT: ld1 { v1.s }[2], [x8]
; CHECK-NEXT: add x8, sp, #8
-; CHECK-NEXT: add v2.4s, v3.4s, v2.4s
+; CHECK-NEXT: add v3.4s, v3.4s, v2.4s
; CHECK-NEXT: ld1 { v1.s }[3], [x8]
; CHECK-NEXT: ldr x8, [sp, #32]
; CHECK-NEXT: mov v0.s[3], w3
-; CHECK-NEXT: cmhi v3.4s, v3.4s, v2.4s
-; CHECK-NEXT: str d2, [x8, #16]
-; CHECK-NEXT: mov w5, v3.s[1]
-; CHECK-NEXT: fmov w4, s3
-; CHECK-NEXT: add v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: str q1, [x8]
-; CHECK-NEXT: mov w1, v0.s[1]
-; CHECK-NEXT: mov w2, v0.s[2]
-; CHECK-NEXT: mov w3, v0.s[3]
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: cmhi v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: str d3, [x8, #16]
+; CHECK-NEXT: mov w5, v2.s[1]
+; CHECK-NEXT: fmov w4, s2
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmhi v1.4s, v1.4s, v0.4s
+; CHECK-NEXT: str q0, [x8]
+; CHECK-NEXT: mov w1, v1.s[1]
+; CHECK-NEXT: mov w2, v1.s[2]
+; CHECK-NEXT: mov w3, v1.s[3]
+; CHECK-NEXT: fmov w0, s1
; CHECK-NEXT: ret
%t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
%val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
@@ -121,11 +121,11 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; CHECK-LABEL: uaddo_v8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT: add v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: cmhi v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: stp q2, q3, [x0]
+; CHECK-NEXT: add v4.4s, v0.4s, v2.4s
+; CHECK-NEXT: add v5.4s, v1.4s, v3.4s
+; CHECK-NEXT: cmhi v0.4s, v2.4s, v4.4s
+; CHECK-NEXT: cmhi v1.4s, v3.4s, v5.4s
+; CHECK-NEXT: stp q4, q5, [x0]
; CHECK-NEXT: ret
%t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
%val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
@@ -139,7 +139,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; CHECK-LABEL: uaddo_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add v4.16b, v0.16b, v1.16b
-; CHECK-NEXT: cmhi v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: cmhi v0.16b, v1.16b, v4.16b
; CHECK-NEXT: str q4, [x0]
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b
@@ -171,7 +171,7 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; CHECK-LABEL: uaddo_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.8h, v0.8h, v1.8h
-; CHECK-NEXT: cmhi v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: cmhi v0.8h, v1.8h, v2.8h
; CHECK-NEXT: str q2, [x0]
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b
@@ -194,9 +194,9 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; CHECK-LABEL: uaddo_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: add v1.2d, v0.2d, v1.2d
-; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: str q1, [x0]
+; CHECK-NEXT: add v2.2d, v0.2d, v1.2d
+; CHECK-NEXT: cmhi v0.2d, v1.2d, v2.2d
+; CHECK-NEXT: str q2, [x0]
; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: ret
%t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 19b801a840ae0..2df0b8df50398 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -128,31 +128,6 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: sadd64rr
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[S_ADD_U]]
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[COPY9]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%add = add i64 %a, %b
store i64 %add, ptr addrspace(1) %out
@@ -263,30 +238,6 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: sadd64ri
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
-; GCN-ISEL-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
-; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[S_ADD_U]]
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%add = add i64 20015998343286, %a
store i64 %add, ptr addrspace(1) %out
@@ -389,29 +340,6 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: vadd64rr
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
-; GCN-ISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
-; GCN-ISEL-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $vcc, implicit $exec
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[V_ADD_U]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -514,26 +442,6 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: vadd64ri
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
-; GCN-ISEL-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
-; GCN-ISEL-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE2]], implicit-def dead $vcc, implicit $exec
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[V_ADD_U]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -642,24 +550,6 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: suaddo32
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
-; GCN-ISEL-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY3]], killed [[COPY4]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORD_OFFSET killed [[COPY5]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
@@ -796,35 +686,6 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: uaddo32_vcc_user
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
-; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]]
-; GCN-ISEL-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 killed [[COPY9]], [[COPY11]], 0, implicit $exec
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORD_OFFSET killed [[V_ADD_CO_U32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
-; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_ADD_CO_U32_e64_1]], implicit $exec
-; GCN-ISEL-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
@@ -971,38 +832,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9]
; GFX1250-NEXT: global_store_b8 v2, v3, s[10:11]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: suaddo64
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s256) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
-; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
-; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
-; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
-; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
-; GCN-ISEL-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
-; GCN-ISEL-NEXT: [[S_UADDO:%[0-9]+]]:sreg_32, [[S_UADDO1:%[0-9]+]]:sreg_64_xexec = S_UADDO_PSEUDO killed [[COPY10]], killed [[COPY12]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[S_ADD_C:%[0-9]+]]:sreg_32, [[S_ADD_C1:%[0-9]+]]:sreg_64_xexec = S_ADD_CO_PSEUDO killed [[COPY9]], killed [[COPY11]], killed [[S_UADDO1]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_UADDO]], %subreg.sub0, killed [[S_ADD_C]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE4]]
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[COPY13]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
-; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_ADD_C1]], implicit $exec
-; GCN-ISEL-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
@@ -1149,40 +978,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: vuaddo64
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
-; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 killed [[COPY3]], [[COPY1]](s32), 0, implicit $exec
-; GCN-ISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-; GCN-ISEL-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]]
-; GCN-ISEL-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_2]]
-; GCN-ISEL-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY12]], [[COPY13]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-; GCN-ISEL-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
-; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_ADDC_U32_e64_1]], implicit $exec
-; GCN-ISEL-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -1310,31 +1105,6 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: ssub64rr
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[S_SUB_U]]
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[COPY9]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%sub = sub i64 %a, %b
store i64 %sub, ptr addrspace(1) %out
@@ -1445,30 +1215,6 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: ssub64ri
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
-; GCN-ISEL-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
-; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[S_SUB_U]]
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%sub = sub i64 20015998343286, %a
store i64 %sub, ptr addrspace(1) %out
@@ -1571,29 +1317,6 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[2:3], v[0:1]
; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: vsub64rr
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
-; GCN-ISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
-; GCN-ISEL-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], implicit-def dead $vcc, implicit $exec
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[V_SUB_U]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1696,26 +1419,6 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: vsub64ri
-; GCN-ISEL: bb.0.entry:
-; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4660
-; GCN-ISEL-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1450743926
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
-; GCN-ISEL-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], implicit-def dead $vcc, implicit $exec
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[V_SUB_U]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1825,24 +1528,6 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: susubo32
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
-; GCN-ISEL-NEXT: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 killed [[COPY3]], killed [[COPY4]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_SUB_I32_]]
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORD_OFFSET killed [[COPY5]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
@@ -1979,35 +1664,6 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: usubo32_vcc_user
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
-; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]]
-; GCN-ISEL-NEXT: [[V_SUB_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_SUB_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_SUB_CO_U32_e64 killed [[COPY9]], [[COPY11]], 0, implicit $exec
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORD_OFFSET killed [[V_SUB_CO_U32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
-; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_SUB_CO_U32_e64_1]], implicit $exec
-; GCN-ISEL-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
@@ -2029,21 +1685,20 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s11, 0xf000
; CISI-NEXT: s_mov_b32 s10, -1
; CISI-NEXT: s_waitcnt lgkmcnt(0)
-; CISI-NEXT: v_mov_b32_e32 v0, s6
-; CISI-NEXT: v_mov_b32_e32 v1, s7
-; CISI-NEXT: s_sub_u32 s6, s4, s6
-; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; CISI-NEXT: s_subb_u32 s7, s5, s7
-; CISI-NEXT: v_mov_b32_e32 v2, s6
+; CISI-NEXT: s_sub_u32 s4, s4, s6
+; CISI-NEXT: s_subb_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
+; CISI-NEXT: v_mov_b32_e32 v0, s4
+; CISI-NEXT: v_mov_b32_e32 v1, s5
+; CISI-NEXT: s_cselect_b64 s[4:5], -1, 0
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s10
; CISI-NEXT: s_mov_b32 s3, s11
-; CISI-NEXT: v_mov_b32_e32 v3, s7
-; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CISI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; CISI-NEXT: s_waitcnt expcnt(0)
+; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; CISI-NEXT: s_endpgm
;
@@ -2052,18 +1707,16 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v7, s1
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
-; VI-NEXT: v_mov_b32_e32 v6, s0
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
@@ -2072,14 +1725,12 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s14
-; GFX9-NEXT: v_mov_b32_e32 v1, s15
; GFX9-NEXT: s_sub_u32 s0, s12, s14
; GFX9-NEXT: s_subb_u32 s1, s13, s15
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX9-NEXT: global_store_byte v2, v3, s[10:11]
; GFX9-NEXT: s_endpgm
@@ -2092,8 +1743,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: s_sub_u32 s0, s12, s14
; GFX1010-NEXT: s_subb_u32 s1, s13, s15
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
-; GFX1010-NEXT: v_cmp_gt_u64_e64 s0, s[12:13], s[14:15]
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX1010-NEXT: global_store_byte v2, v3, s[10:11]
@@ -2104,11 +1755,11 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_u32 s8, s4, s6
-; GFX1030W32-NEXT: v_cmp_gt_u64_e64 s4, s[4:5], s[6:7]
-; GFX1030W32-NEXT: s_subb_u32 s9, s5, s7
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s8
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s9
+; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6
+; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
@@ -2119,11 +1770,11 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_u32 s8, s4, s6
-; GFX1030W64-NEXT: s_subb_u32 s9, s5, s7
-; GFX1030W64-NEXT: v_cmp_gt_u64_e64 s[4:5], s[4:5], s[6:7]
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s8
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s9
+; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6
+; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
+; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
@@ -2133,11 +1784,11 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s8, s4, s6
-; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[4:5], s[6:7]
-; GFX11-NEXT: s_subb_u32 s9, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s8
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: s_sub_u32 s4, s4, s6
+; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -2148,51 +1799,17 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
-; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_sub_nc_u64 s[0:1], s[12:13], s[14:15]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX1250-NEXT: v_cmp_gt_u64_e64 s0, s[12:13], s[14:15]
+; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14
+; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9]
; GFX1250-NEXT: global_store_b8 v2, v3, s[10:11]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: susubo64
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s256) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
-; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
-; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
-; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
-; GCN-ISEL-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
-; GCN-ISEL-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
-; GCN-ISEL-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE5]]
-; GCN-ISEL-NEXT: [[V_CMP_GT_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U64_e64 [[REG_SEQUENCE4]], [[COPY13]], implicit $exec
-; GCN-ISEL-NEXT: [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO [[REG_SEQUENCE4]], [[REG_SEQUENCE5]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[S_SUB_U]]
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[COPY14]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
-; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_CMP_GT_U64_e64_]], implicit $exec
-; GCN-ISEL-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
@@ -2212,22 +1829,21 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI: ; %bb.0:
; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; CISI-NEXT: v_mov_b32_e32 v1, 0
; CISI-NEXT: s_mov_b32 s7, 0xf000
; CISI-NEXT: s_mov_b32 s6, -1
; CISI-NEXT: s_waitcnt lgkmcnt(0)
; CISI-NEXT: s_mov_b32 s4, s0
-; CISI-NEXT: v_mov_b32_e32 v3, s9
-; CISI-NEXT: v_sub_i32_e32 v2, vcc, s8, v0
-; CISI-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
+; CISI-NEXT: v_mov_b32_e32 v1, s9
+; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
; CISI-NEXT: s_mov_b32 s5, s1
+; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CISI-NEXT: s_mov_b32 s0, s2
; CISI-NEXT: s_mov_b32 s1, s3
; CISI-NEXT: s_mov_b32 s2, s6
; CISI-NEXT: s_mov_b32 s3, s7
+; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; CISI-NEXT: s_waitcnt expcnt(0)
; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CISI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; CISI-NEXT: s_endpgm
;
@@ -2235,34 +1851,31 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v7, s5
-; VI-NEXT: v_sub_u32_e32 v6, vcc, s4, v0
-; VI-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v6, s5
+; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v0
+; VI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[6:7]
-; VI-NEXT: flat_store_byte v[4:5], v0
+; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT: flat_store_byte v[3:4], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vusubo64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v0
-; GFX9-NEXT: v_subbrev_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vusubo64:
@@ -2270,14 +1883,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: s_clause 0x1
; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1010-NEXT: v_mov_b32_e32 v1, 0
+; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1010-NEXT: v_sub_co_u32 v2, s4, s6, v0
-; GFX1010-NEXT: v_sub_co_ci_u32_e64 v3, s4, s7, 0, s4
-; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX1010-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX1010-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX1010-NEXT: v_sub_co_u32 v0, s4, s6, v0
+; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vusubo64:
@@ -2285,14 +1897,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, 0
+; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1030W32-NEXT: v_sub_co_u32 v2, s4, s6, v0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v3, null, s7, 0, s4
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX1030W32-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX1030W32-NEXT: v_sub_co_u32 v0, s4, s6, v0
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vusubo64:
@@ -2300,14 +1911,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, 0
+; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX1030W64-NEXT: v_sub_co_u32 v2, s[4:5], s6, v0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v3, null, s7, 0, s[4:5]
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX1030W64-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s6, v0
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vusubo64:
@@ -2315,16 +1925,17 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX11-NEXT: v_sub_co_u32 v2, s4, s6, v0
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, s7, 0, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_u32 v0, s4, s6, v0
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v1, v[2:3], s[0:1]
-; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: vusubo64:
@@ -2333,50 +1944,18 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_sub_co_u32 v0, s4, s6, v0
+; GFX1250-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[6:7], v[0:1]
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
-; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
; GFX1250-NEXT: s_endpgm
-; GCN-ISEL-LABEL: name: vusubo64
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
-; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
-; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-; GCN-ISEL-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
-; GCN-ISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-; GCN-ISEL-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]](s32), %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
-; GCN-ISEL-NEXT: [[V_CMP_GT_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U64_e64 [[REG_SEQUENCE4]], [[REG_SEQUENCE5]], implicit $exec
-; GCN-ISEL-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO [[REG_SEQUENCE4]], [[REG_SEQUENCE5]], implicit-def dead $vcc, implicit $exec
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET killed [[V_SUB_U]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
-; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[V_CMP_GT_U64_e64_]], implicit $exec
-; GCN-ISEL-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %tid.ext)
@@ -3626,292 +3205,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: .LBB16_4:
; GFX1250-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX1250-NEXT: s_branch .LBB16_2
-; GCN-ISEL-LABEL: name: sudiv64
-; GCN-ISEL: bb.0 (%ir-block.0):
-; GCN-ISEL-NEXT: successors: %bb.3(0x50000000), %bb.1(0x30000000)
-; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
-; GCN-ISEL-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-; GCN-ISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
-; GCN-ISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
-; GCN-ISEL-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
-; GCN-ISEL-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_192 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3, [[COPY2]], %subreg.sub4, [[COPY1]], %subreg.sub5
-; GCN-ISEL-NEXT: [[COPY7:%[0-9]+]]:sgpr_192 = COPY [[REG_SEQUENCE]]
-; GCN-ISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]]
-; GCN-ISEL-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]]
-; GCN-ISEL-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
-; GCN-ISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_]], %subreg.sub0, killed [[COPY10]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-; GCN-ISEL-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
-; GCN-ISEL-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 killed [[REG_SEQUENCE3]], [[COPY11]], implicit $exec
-; GCN-ISEL-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-; GCN-ISEL-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-; GCN-ISEL-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, killed [[V_CMP_NE_U64_e64_]], implicit-def dead $scc
-; GCN-ISEL-NEXT: $vcc = COPY [[S_AND_B64_]]
-; GCN-ISEL-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit $vcc
-; GCN-ISEL-NEXT: S_BRANCH %bb.1
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: bb.1.Flow:
-; GCN-ISEL-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF]], %bb.0, %6, %bb.3
-; GCN-ISEL-NEXT: [[PHI1:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_1]], %bb.0, %40, %bb.3
-; GCN-ISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI1]], implicit $exec
-; GCN-ISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-; GCN-ISEL-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_]]
-; GCN-ISEL-NEXT: S_CMP_LG_U32 killed [[COPY12]], killed [[S_MOV_B32_1]], implicit-def $scc
-; GCN-ISEL-NEXT: S_CBRANCH_SCC1 %bb.4, implicit $scc
-; GCN-ISEL-NEXT: S_BRANCH %bb.2
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: bb.2 (%ir-block.7):
-; GCN-ISEL-NEXT: successors: %bb.4(0x80000000)
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY9]].sub0
-; GCN-ISEL-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-; GCN-ISEL-NEXT: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 killed [[S_MOV_B32_2]], [[COPY13]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY13]], implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[V_RCP_IFLAG_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e32 killed [[V_CVT_F32_U32_e32_]], implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1333788670, killed [[V_RCP_IFLAG_F32_e32_]], implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 killed [[V_MUL_F32_e32_]], implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e32_]]
-; GCN-ISEL-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_SUB_I32_]], [[COPY15]]
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e32_]], killed [[S_MUL_I32_]], implicit $exec
-; GCN-ISEL-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e32_]]
-; GCN-ISEL-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_]]
-; GCN-ISEL-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY16]], killed [[COPY17]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY killed [[S_ADD_I32_]]
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY14]], [[COPY18]], implicit $exec
-; GCN-ISEL-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-; GCN-ISEL-NEXT: [[COPY19:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_1]]
-; GCN-ISEL-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY19]], [[S_MOV_B32_3]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY20:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_1]]
-; GCN-ISEL-NEXT: [[S_MUL_I32_1:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY20]], [[COPY13]]
-; GCN-ISEL-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY14]], killed [[S_MUL_I32_1]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_SUB_I32_1]], [[COPY13]], implicit-def dead $scc
-; GCN-ISEL-NEXT: S_CMP_GE_U32 [[S_SUB_I32_1]], [[COPY13]], implicit-def $scc
-; GCN-ISEL-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_SUB_I32_2]], [[S_SUB_I32_1]], implicit $scc
-; GCN-ISEL-NEXT: [[COPY21:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_1]]
-; GCN-ISEL-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_ADD_I32_1]], [[COPY21]], implicit $scc
-; GCN-ISEL-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_CSELECT_B32_1]], [[S_MOV_B32_3]], implicit-def dead $scc
-; GCN-ISEL-NEXT: S_CMP_GE_U32 killed [[S_CSELECT_B32_]], [[COPY13]], implicit-def $scc
-; GCN-ISEL-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_ADD_I32_2]], [[S_CSELECT_B32_1]], implicit $scc
-; GCN-ISEL-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_CSELECT_B32_2]], %subreg.sub0, killed [[S_MOV_B32_4]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY22:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE4]]
-; GCN-ISEL-NEXT: S_BRANCH %bb.4
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: bb.3 (%ir-block.12):
-; GCN-ISEL-NEXT: successors: %bb.1(0x80000000)
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: [[COPY23:%[0-9]+]]:sreg_32 = COPY [[COPY9]].sub0
-; GCN-ISEL-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY23]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY9]].sub1
-; GCN-ISEL-NEXT: [[V_CVT_F32_U32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[COPY24]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 1333788672
-; GCN-ISEL-NEXT: [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed [[V_CVT_F32_U32_e64_1]], 0, killed [[S_MOV_B32_5]], 0, killed [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, killed [[V_FMA_F32_e64_]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 1602224124
-; GCN-ISEL-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[V_RCP_F32_e64_]], 0, killed [[S_MOV_B32_6]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 796917760
-; GCN-ISEL-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_MUL_F32_e64_]], 0, killed [[S_MOV_B32_7]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[V_TRUNC_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_TRUNC_F32_e64 0, killed [[V_MUL_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 -813694976
-; GCN-ISEL-NEXT: [[V_FMA_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[V_TRUNC_F32_e64_]], 0, killed [[S_MOV_B32_8]], 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[V_CVT_U32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed [[V_FMA_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-; GCN-ISEL-NEXT: [[S_SUB_U:%[0-9]+]]:sreg_64 = S_SUB_U64_PSEUDO killed [[S_MOV_B64_2]], [[COPY9]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY25:%[0-9]+]]:sreg_32 = COPY [[S_SUB_U]].sub1
-; GCN-ISEL-NEXT: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
-; GCN-ISEL-NEXT: [[S_MUL_I32_2:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY25]], [[COPY26]]
-; GCN-ISEL-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[S_SUB_U]].sub0
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY27]], [[V_CVT_U32_F32_e64_]], implicit $exec
-; GCN-ISEL-NEXT: [[V_CVT_U32_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_TRUNC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
-; GCN-ISEL-NEXT: [[COPY28:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
-; GCN-ISEL-NEXT: [[S_MUL_I32_3:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[COPY28]]
-; GCN-ISEL-NEXT: [[COPY29:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_2]]
-; GCN-ISEL-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY29]], killed [[S_MUL_I32_3]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_3]], killed [[S_MUL_I32_2]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_]], [[S_ADD_I32_4]], implicit $exec
-; GCN-ISEL-NEXT: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
-; GCN-ISEL-NEXT: [[S_MUL_I32_4:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY30]], [[S_ADD_I32_4]]
-; GCN-ISEL-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_4]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_3]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
-; GCN-ISEL-NEXT: [[S_MUL_I32_5:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[COPY31]]
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_]], [[S_MUL_I32_5]], implicit $exec
-; GCN-ISEL-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_MUL_HI_U32_e64_4]], %subreg.sub0, [[S_MOV_B32_9]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_ADD_U:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE6]], killed [[REG_SEQUENCE5]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY32:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U]].sub0
-; GCN-ISEL-NEXT: [[COPY33:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U]].sub1
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_1]], [[S_ADD_I32_4]], implicit $exec
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_6:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_1]], [[S_MUL_I32_5]], implicit $exec
-; GCN-ISEL-NEXT: [[COPY34:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
-; GCN-ISEL-NEXT: [[S_MUL_I32_6:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY34]], [[S_MUL_I32_5]]
-; GCN-ISEL-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_6]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_6]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY35:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE7]].sub0
-; GCN-ISEL-NEXT: [[COPY36:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE7]].sub1
-; GCN-ISEL-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-; GCN-ISEL-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 killed [[COPY32]], killed [[COPY35]], implicit-def $scc
-; GCN-ISEL-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY33]], killed [[COPY36]], implicit-def $scc, implicit $scc
-; GCN-ISEL-NEXT: [[COPY37:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_5]]
-; GCN-ISEL-NEXT: [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY37]], [[S_MOV_B32_10]], implicit-def dead $scc, implicit $scc
-; GCN-ISEL-NEXT: [[COPY38:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
-; GCN-ISEL-NEXT: [[S_MUL_I32_7:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY38]], [[S_ADD_I32_4]]
-; GCN-ISEL-NEXT: [[REG_SEQUENCE8:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_7]], %subreg.sub0, killed [[S_ADDC_U32_1]], %subreg.sub1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE9:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_]], %subreg.sub0, killed [[S_ADDC_U32_]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY39:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE9]].sub1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE10:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY39]], %subreg.sub0, [[S_MOV_B32_10]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_ADD_U1:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE10]], killed [[REG_SEQUENCE8]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY40:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U1]].sub0
-; GCN-ISEL-NEXT: [[COPY41:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_]]
-; GCN-ISEL-NEXT: [[S_UADDO:%[0-9]+]]:sreg_32, [[S_UADDO1:%[0-9]+]]:sreg_64_xexec = S_UADDO_PSEUDO [[COPY41]], killed [[COPY40]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY42:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U1]].sub1
-; GCN-ISEL-NEXT: [[COPY43:%[0-9]+]]:sreg_32 = COPY [[V_CVT_U32_F32_e64_1]]
-; GCN-ISEL-NEXT: [[S_ADD_C:%[0-9]+]]:sreg_32, [[S_ADD_C1:%[0-9]+]]:sreg_64_xexec = S_ADD_CO_PSEUDO [[COPY43]], killed [[COPY42]], killed [[S_UADDO1]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[S_MUL_I32_8:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[S_ADD_C]]
-; GCN-ISEL-NEXT: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_UADDO]]
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_7:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY27]], [[COPY44]], implicit $exec
-; GCN-ISEL-NEXT: [[COPY45:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_7]]
-; GCN-ISEL-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY45]], killed [[S_MUL_I32_8]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[S_MUL_I32_9:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY25]], [[S_UADDO]]
-; GCN-ISEL-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_5]], killed [[S_MUL_I32_9]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_6]]
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_8:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_ADD_C]], [[COPY46]], implicit $exec
-; GCN-ISEL-NEXT: [[S_MUL_I32_10:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY27]], [[S_UADDO]]
-; GCN-ISEL-NEXT: [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_10]]
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_9:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_ADD_C]], [[COPY47]], implicit $exec
-; GCN-ISEL-NEXT: [[S_MUL_I32_11:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_ADD_C]], [[S_MUL_I32_10]]
-; GCN-ISEL-NEXT: [[REG_SEQUENCE11:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_11]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_9]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY48:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE11]].sub0
-; GCN-ISEL-NEXT: [[COPY49:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE11]].sub1
-; GCN-ISEL-NEXT: [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_6]]
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_10:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_UADDO]], [[COPY50]], implicit $exec
-; GCN-ISEL-NEXT: [[S_MUL_I32_12:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_UADDO]], [[S_ADD_I32_6]]
-; GCN-ISEL-NEXT: [[REG_SEQUENCE12:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_12]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_10]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MUL_I32_10]]
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_11:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[S_UADDO]], [[COPY51]], implicit $exec
-; GCN-ISEL-NEXT: [[REG_SEQUENCE13:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_MUL_HI_U32_e64_11]], %subreg.sub0, [[S_MOV_B32_9]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_ADD_U2:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE13]], killed [[REG_SEQUENCE12]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY52:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U2]].sub0
-; GCN-ISEL-NEXT: [[COPY53:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U2]].sub1
-; GCN-ISEL-NEXT: [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 killed [[COPY52]], killed [[COPY48]], implicit-def $scc
-; GCN-ISEL-NEXT: [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY53]], killed [[COPY49]], implicit-def $scc, implicit $scc
-; GCN-ISEL-NEXT: [[COPY54:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_8]]
-; GCN-ISEL-NEXT: [[S_ADDC_U32_3:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY54]], [[S_MOV_B32_10]], implicit-def dead $scc, implicit $scc
-; GCN-ISEL-NEXT: [[S_MUL_I32_13:%[0-9]+]]:sreg_32 = S_MUL_I32 [[S_ADD_C]], [[S_ADD_I32_6]]
-; GCN-ISEL-NEXT: [[REG_SEQUENCE14:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_13]], %subreg.sub0, killed [[S_ADDC_U32_3]], %subreg.sub1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE15:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_1]], %subreg.sub0, killed [[S_ADDC_U32_2]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY55:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE15]].sub1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE16:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY55]], %subreg.sub0, [[S_MOV_B32_10]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_ADD_U3:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE16]], killed [[REG_SEQUENCE14]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY56:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U3]].sub0
-; GCN-ISEL-NEXT: [[S_UADDO2:%[0-9]+]]:sreg_32, [[S_UADDO3:%[0-9]+]]:sreg_64_xexec = S_UADDO_PSEUDO [[S_UADDO]], killed [[COPY56]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY57:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U3]].sub1
-; GCN-ISEL-NEXT: [[S_ADD_C2:%[0-9]+]]:sreg_32, [[S_ADD_C3:%[0-9]+]]:sreg_64_xexec = S_ADD_CO_PSEUDO [[S_ADD_C]], killed [[COPY57]], killed [[S_UADDO3]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY58:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub0
-; GCN-ISEL-NEXT: [[COPY59:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_C2]]
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_12:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY58]], [[COPY59]], implicit $exec
-; GCN-ISEL-NEXT: [[S_MUL_I32_14:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY58]], [[S_ADD_C2]]
-; GCN-ISEL-NEXT: [[REG_SEQUENCE17:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_14]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_12]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY60:%[0-9]+]]:vgpr_32 = COPY [[S_UADDO2]]
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_13:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY58]], [[COPY60]], implicit $exec
-; GCN-ISEL-NEXT: [[REG_SEQUENCE18:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_MUL_HI_U32_e64_13]], %subreg.sub0, [[S_MOV_B32_9]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_ADD_U4:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE18]], killed [[REG_SEQUENCE17]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY61:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U4]].sub0
-; GCN-ISEL-NEXT: [[COPY62:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U4]].sub1
-; GCN-ISEL-NEXT: [[COPY63:%[0-9]+]]:sreg_32 = COPY [[COPY8]].sub1
-; GCN-ISEL-NEXT: [[COPY64:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_C2]]
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_14:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY63]], [[COPY64]], implicit $exec
-; GCN-ISEL-NEXT: [[COPY65:%[0-9]+]]:vgpr_32 = COPY [[S_UADDO2]]
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_15:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY63]], [[COPY65]], implicit $exec
-; GCN-ISEL-NEXT: [[S_MUL_I32_15:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY63]], [[S_UADDO2]]
-; GCN-ISEL-NEXT: [[REG_SEQUENCE19:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_15]], %subreg.sub0, killed [[V_MUL_HI_U32_e64_15]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY66:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE19]].sub0
-; GCN-ISEL-NEXT: [[COPY67:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE19]].sub1
-; GCN-ISEL-NEXT: [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 killed [[COPY61]], killed [[COPY66]], implicit-def $scc
-; GCN-ISEL-NEXT: [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY62]], killed [[COPY67]], implicit-def $scc, implicit $scc
-; GCN-ISEL-NEXT: [[COPY68:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_14]]
-; GCN-ISEL-NEXT: [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 killed [[COPY68]], [[S_MOV_B32_10]], implicit-def dead $scc, implicit $scc
-; GCN-ISEL-NEXT: [[S_MUL_I32_16:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY63]], [[S_ADD_C2]]
-; GCN-ISEL-NEXT: [[REG_SEQUENCE20:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_16]], %subreg.sub0, killed [[S_ADDC_U32_5]], %subreg.sub1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE21:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_2]], %subreg.sub0, killed [[S_ADDC_U32_4]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY69:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE21]].sub1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE22:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY69]], %subreg.sub0, [[S_MOV_B32_10]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_ADD_U5:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO killed [[REG_SEQUENCE22]], killed [[REG_SEQUENCE20]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY70:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U5]].sub1
-; GCN-ISEL-NEXT: [[S_MUL_I32_17:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY23]], [[COPY70]]
-; GCN-ISEL-NEXT: [[COPY71:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U5]].sub0
-; GCN-ISEL-NEXT: [[COPY72:%[0-9]+]]:vgpr_32 = COPY [[COPY71]]
-; GCN-ISEL-NEXT: [[V_MUL_HI_U32_e64_16:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[COPY23]], [[COPY72]], implicit $exec
-; GCN-ISEL-NEXT: [[COPY73:%[0-9]+]]:sreg_32 = COPY [[V_MUL_HI_U32_e64_16]]
-; GCN-ISEL-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[COPY73]], killed [[S_MUL_I32_17]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[S_MUL_I32_18:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY24]], [[COPY71]]
-; GCN-ISEL-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_7]], killed [[S_MUL_I32_18]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[S_SUB_I32_3:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY63]], [[S_ADD_I32_8]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[S_MUL_I32_19:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY23]], [[COPY71]]
-; GCN-ISEL-NEXT: [[S_USUBO:%[0-9]+]]:sreg_32, [[S_USUBO1:%[0-9]+]]:sreg_64_xexec = S_USUBO_PSEUDO [[COPY58]], killed [[S_MUL_I32_19]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[S_SUB_C:%[0-9]+]]:sreg_32, [[S_SUB_C1:%[0-9]+]]:sreg_64_xexec = S_SUB_CO_PSEUDO killed [[S_SUB_I32_3]], [[COPY24]], [[S_USUBO1]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[S_USUBO2:%[0-9]+]]:sreg_32, [[S_USUBO3:%[0-9]+]]:sreg_64_xexec = S_USUBO_PSEUDO [[S_USUBO]], [[COPY23]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[S_SUB_C2:%[0-9]+]]:sreg_32, [[S_SUB_C3:%[0-9]+]]:sreg_64_xexec = S_SUB_CO_PSEUDO killed [[S_SUB_C]], [[S_MOV_B32_10]], killed [[S_USUBO3]], implicit-def dead $scc
-; GCN-ISEL-NEXT: S_CMP_GE_U32 [[S_SUB_C2]], [[COPY24]], implicit-def $scc
-; GCN-ISEL-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[S_CSELECT_B32_3:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
-; GCN-ISEL-NEXT: S_CMP_GE_U32 killed [[S_USUBO2]], [[COPY23]], implicit-def $scc
-; GCN-ISEL-NEXT: [[S_CSELECT_B32_4:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
-; GCN-ISEL-NEXT: S_CMP_EQ_U32 [[S_SUB_C2]], [[COPY24]], implicit-def $scc
-; GCN-ISEL-NEXT: [[S_CSELECT_B32_5:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_4]], killed [[S_CSELECT_B32_3]], implicit $scc
-; GCN-ISEL-NEXT: [[COPY74:%[0-9]+]]:sreg_32 = COPY killed [[S_CSELECT_B32_5]]
-; GCN-ISEL-NEXT: [[REG_SEQUENCE23:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY71]], %subreg.sub0, [[COPY70]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 1
-; GCN-ISEL-NEXT: [[S_ADD_U6:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO [[REG_SEQUENCE23]], killed [[S_MOV_B64_3]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY75:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U6]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B64_4:%[0-9]+]]:sreg_64 = S_MOV_B64 2
-; GCN-ISEL-NEXT: [[S_ADD_U7:%[0-9]+]]:sreg_64 = S_ADD_U64_PSEUDO [[REG_SEQUENCE23]], killed [[S_MOV_B64_4]], implicit-def dead $scc
-; GCN-ISEL-NEXT: [[COPY76:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U7]].sub0
-; GCN-ISEL-NEXT: S_CMP_LG_U32 killed [[COPY74]], [[S_MOV_B32_10]], implicit-def $scc
-; GCN-ISEL-NEXT: [[S_CSELECT_B32_6:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[COPY76]], killed [[COPY75]], implicit $scc
-; GCN-ISEL-NEXT: [[COPY77:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U6]].sub1
-; GCN-ISEL-NEXT: [[COPY78:%[0-9]+]]:sreg_32 = COPY [[S_ADD_U7]].sub1
-; GCN-ISEL-NEXT: [[S_CSELECT_B32_7:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[COPY78]], killed [[COPY77]], implicit $scc
-; GCN-ISEL-NEXT: [[S_SUB_C4:%[0-9]+]]:sreg_32, [[S_SUB_C5:%[0-9]+]]:sreg_64_xexec = S_SUB_CO_PSEUDO [[COPY63]], [[S_ADD_I32_8]], [[S_USUBO1]], implicit-def dead $scc
-; GCN-ISEL-NEXT: S_CMP_GE_U32 [[S_SUB_C4]], [[COPY24]], implicit-def $scc
-; GCN-ISEL-NEXT: [[S_CSELECT_B32_8:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
-; GCN-ISEL-NEXT: S_CMP_GE_U32 [[S_USUBO]], [[COPY23]], implicit-def $scc
-; GCN-ISEL-NEXT: [[S_CSELECT_B32_9:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_MOV_B32_11]], [[S_MOV_B32_10]], implicit $scc
-; GCN-ISEL-NEXT: S_CMP_EQ_U32 [[S_SUB_C4]], [[COPY24]], implicit-def $scc
-; GCN-ISEL-NEXT: [[S_CSELECT_B32_10:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_9]], killed [[S_CSELECT_B32_8]], implicit $scc
-; GCN-ISEL-NEXT: [[COPY79:%[0-9]+]]:sreg_32 = COPY killed [[S_CSELECT_B32_10]]
-; GCN-ISEL-NEXT: S_CMP_LG_U32 killed [[COPY79]], [[S_MOV_B32_10]], implicit-def $scc
-; GCN-ISEL-NEXT: [[S_CSELECT_B32_11:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_7]], [[COPY70]], implicit $scc
-; GCN-ISEL-NEXT: [[S_CSELECT_B32_12:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_CSELECT_B32_6]], [[COPY71]], implicit $scc
-; GCN-ISEL-NEXT: [[REG_SEQUENCE24:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_CSELECT_B32_12]], %subreg.sub0, killed [[S_CSELECT_B32_11]], %subreg.sub1
-; GCN-ISEL-NEXT: [[S_MOV_B64_5:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-; GCN-ISEL-NEXT: [[COPY80:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE24]]
-; GCN-ISEL-NEXT: S_BRANCH %bb.1
-; GCN-ISEL-NEXT: {{ $}}
-; GCN-ISEL-NEXT: bb.4 (%ir-block.14):
-; GCN-ISEL-NEXT: [[PHI2:%[0-9]+]]:sreg_64 = PHI [[PHI]], %bb.1, [[COPY22]], %bb.2
-; GCN-ISEL-NEXT: [[COPY81:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub1
-; GCN-ISEL-NEXT: [[COPY82:%[0-9]+]]:sreg_32 = COPY [[COPY7]].sub0
-; GCN-ISEL-NEXT: [[REG_SEQUENCE25:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY82]], %subreg.sub0, killed [[COPY81]], %subreg.sub1
-; GCN-ISEL-NEXT: [[COPY83:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE25]].sub1
-; GCN-ISEL-NEXT: [[COPY84:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE25]].sub0
-; GCN-ISEL-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
-; GCN-ISEL-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-; GCN-ISEL-NEXT: [[REG_SEQUENCE26:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY84]], %subreg.sub0, killed [[COPY83]], %subreg.sub1, killed [[S_MOV_B32_13]], %subreg.sub2, killed [[S_MOV_B32_12]], %subreg.sub3
-; GCN-ISEL-NEXT: [[COPY85:%[0-9]+]]:vreg_64 = COPY [[PHI2]]
-; GCN-ISEL-NEXT: BUFFER_STORE_DWORDX2_OFFSET [[COPY85]], killed [[REG_SEQUENCE26]], 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.16, addrspace 1)
-; GCN-ISEL-NEXT: S_ENDPGM 0
%result = udiv i64 %x, %y
store i64 %result, ptr addrspace(1) %out
ret void
@@ -3932,3 +3225,5 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN-ISEL: {{.*}}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.s b/llvm/test/CodeGen/AMDGPU/carryout-selection.s
new file mode 100644
index 0000000000000..db52017f9091f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.s
@@ -0,0 +1,3547 @@
+--- |
+ ; ModuleID = '../llvm/test/CodeGen/AMDGPU/carryout-selection.ll'
+ source_filename = "../llvm/test/CodeGen/AMDGPU/carryout-selection.ll"
+ target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+ target triple = "amdgcn"
+
+ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
+ entry:
+ %sadd64rr.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %sadd64rr.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %0 = load <3 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %out.load1 = extractelement <3 x i64> %0, i32 0
+ %1 = inttoptr i64 %out.load1 to ptr addrspace(1)
+ %a.load2 = extractelement <3 x i64> %0, i32 1
+ %b.load3 = extractelement <3 x i64> %0, i32 2
+ %add = add i64 %a.load2, %b.load3
+ store i64 %add, ptr addrspace(1) %1, align 8
+ ret void
+ }
+
+ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
+ entry:
+ %sadd64ri.kernarg.segment = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %sadd64ri.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %0 = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %out.load1 = extractelement <2 x i64> %0, i32 0
+ %1 = inttoptr i64 %out.load1 to ptr addrspace(1)
+ %a.load2 = extractelement <2 x i64> %0, i32 1
+ %add = add i64 20015998343286, %a.load2
+ store i64 %add, ptr addrspace(1) %1, align 8
+ ret void
+ }
+
+ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
+ entry:
+ %vadd64rr.kernarg.segment = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %vadd64rr.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %0 = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %out.load1 = extractelement <2 x i64> %0, i32 0
+ %1 = inttoptr i64 %out.load1 to ptr addrspace(1)
+ %a.load2 = extractelement <2 x i64> %0, i32 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %add = add i64 %a.load2, %tid.ext
+ store i64 %add, ptr addrspace(1) %1, align 8
+ ret void
+ }
+
+ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
+ entry:
+ %vadd64ri.kernarg.segment = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %vadd64ri.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %out.load = load ptr addrspace(1), ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %add = add i64 20015998343286, %tid.ext
+ store i64 %add, ptr addrspace(1) %out.load, align 8
+ ret void
+ }
+
+ ; Function Attrs: nounwind
+ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #1 {
+ %suaddo32.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %suaddo32.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %out.load = load ptr addrspace(1), ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %a.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %suaddo32.kernarg.segment, i64 52, !amdgpu.uniform !0
+ %1 = load <2 x i32>, ptr addrspace(4) %a.kernarg.offset, align 4, !invariant.load !0
+ %a.load1 = extractelement <2 x i32> %1, i32 0
+ %b.load2 = extractelement <2 x i32> %1, i32 1
+ %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a.load1, i32 %b.load2)
+ %val = extractvalue { i32, i1 } %uadd, 0
+ store i32 %val, ptr addrspace(1) %out.load, align 4
+ ret void
+ }
+
+ ; Function Attrs: nounwind
+ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #1 {
+ %uaddo32_vcc_user.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %uaddo32_vcc_user.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %1 = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %out.load1 = extractelement <2 x i64> %1, i32 0
+ %2 = inttoptr i64 %out.load1 to ptr addrspace(1)
+ %carryout.load2 = extractelement <2 x i64> %1, i32 1
+ %3 = inttoptr i64 %carryout.load2 to ptr addrspace(1)
+ %a.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %uaddo32_vcc_user.kernarg.segment, i64 52, !amdgpu.uniform !0
+ %4 = load <2 x i32>, ptr addrspace(4) %a.kernarg.offset, align 4, !invariant.load !0
+ %a.load3 = extractelement <2 x i32> %4, i32 0
+ %b.load4 = extractelement <2 x i32> %4, i32 1
+ %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a.load3, i32 %b.load4)
+ %val = extractvalue { i32, i1 } %uadd, 0
+ %carry = extractvalue { i32, i1 } %uadd, 1
+ store i32 %val, ptr addrspace(1) %2, align 4
+ store i1 %carry, ptr addrspace(1) %3, align 1
+ ret void
+ }
+
+ ; Function Attrs: nounwind
+ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #1 {
+ %suaddo64.kernarg.segment = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %suaddo64.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %1 = load <4 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %out.load1 = extractelement <4 x i64> %1, i32 0
+ %2 = inttoptr i64 %out.load1 to ptr addrspace(1)
+ %carryout.load2 = extractelement <4 x i64> %1, i32 1
+ %3 = inttoptr i64 %carryout.load2 to ptr addrspace(1)
+ %a.load3 = extractelement <4 x i64> %1, i32 2
+ %b.load4 = extractelement <4 x i64> %1, i32 3
+ %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a.load3, i64 %b.load4)
+ %val = extractvalue { i64, i1 } %uadd, 0
+ %carry = extractvalue { i64, i1 } %uadd, 1
+ store i64 %val, ptr addrspace(1) %2, align 8
+ store i1 %carry, ptr addrspace(1) %3, align 1
+ ret void
+ }
+
+ ; Function Attrs: nounwind
+ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #1 {
+ %vuaddo64.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %vuaddo64.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %1 = load <3 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %out.load1 = extractelement <3 x i64> %1, i32 0
+ %2 = inttoptr i64 %out.load1 to ptr addrspace(1)
+ %carryout.load2 = extractelement <3 x i64> %1, i32 1
+ %3 = inttoptr i64 %carryout.load2 to ptr addrspace(1)
+ %a.load3 = extractelement <3 x i64> %1, i32 2
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a.load3, i64 %tid.ext)
+ %val = extractvalue { i64, i1 } %uadd, 0
+ %carry = extractvalue { i64, i1 } %uadd, 1
+ store i64 %val, ptr addrspace(1) %2, align 8
+ store i1 %carry, ptr addrspace(1) %3, align 1
+ ret void
+ }
+
+ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
+ entry:
+ %ssub64rr.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %ssub64rr.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %0 = load <3 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %out.load1 = extractelement <3 x i64> %0, i32 0
+ %1 = inttoptr i64 %out.load1 to ptr addrspace(1)
+ %a.load2 = extractelement <3 x i64> %0, i32 1
+ %b.load3 = extractelement <3 x i64> %0, i32 2
+ %sub = sub i64 %a.load2, %b.load3
+ store i64 %sub, ptr addrspace(1) %1, align 8
+ ret void
+ }
+
+ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
+ entry:
+ %ssub64ri.kernarg.segment = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %ssub64ri.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %0 = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %out.load1 = extractelement <2 x i64> %0, i32 0
+ %1 = inttoptr i64 %out.load1 to ptr addrspace(1)
+ %a.load2 = extractelement <2 x i64> %0, i32 1
+ %sub = sub i64 20015998343286, %a.load2
+ store i64 %sub, ptr addrspace(1) %1, align 8
+ ret void
+ }
+
+ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
+ entry:
+ %vsub64rr.kernarg.segment = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %vsub64rr.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %0 = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %out.load1 = extractelement <2 x i64> %0, i32 0
+ %1 = inttoptr i64 %out.load1 to ptr addrspace(1)
+ %a.load2 = extractelement <2 x i64> %0, i32 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %sub = sub i64 %a.load2, %tid.ext
+ store i64 %sub, ptr addrspace(1) %1, align 8
+ ret void
+ }
+
+ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
+ entry:
+ %vsub64ri.kernarg.segment = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %vsub64ri.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %out.load = load ptr addrspace(1), ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %sub = sub i64 20015998343286, %tid.ext
+ store i64 %sub, ptr addrspace(1) %out.load, align 8
+ ret void
+ }
+
+ ; Function Attrs: nounwind
+ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #1 {
+ %susubo32.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %susubo32.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %out.load = load ptr addrspace(1), ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %a.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %susubo32.kernarg.segment, i64 52, !amdgpu.uniform !0
+ %1 = load <2 x i32>, ptr addrspace(4) %a.kernarg.offset, align 4, !invariant.load !0
+ %a.load1 = extractelement <2 x i32> %1, i32 0
+ %b.load2 = extractelement <2 x i32> %1, i32 1
+ %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a.load1, i32 %b.load2)
+ %val = extractvalue { i32, i1 } %usub, 0
+ store i32 %val, ptr addrspace(1) %out.load, align 4
+ ret void
+ }
+
+ ; Function Attrs: nounwind
+ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #1 {
+ %usubo32_vcc_user.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %usubo32_vcc_user.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %1 = load <2 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %out.load1 = extractelement <2 x i64> %1, i32 0
+ %2 = inttoptr i64 %out.load1 to ptr addrspace(1)
+ %carryout.load2 = extractelement <2 x i64> %1, i32 1
+ %3 = inttoptr i64 %carryout.load2 to ptr addrspace(1)
+ %a.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %usubo32_vcc_user.kernarg.segment, i64 52, !amdgpu.uniform !0
+ %4 = load <2 x i32>, ptr addrspace(4) %a.kernarg.offset, align 4, !invariant.load !0
+ %a.load3 = extractelement <2 x i32> %4, i32 0
+ %b.load4 = extractelement <2 x i32> %4, i32 1
+ %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a.load3, i32 %b.load4)
+ %val = extractvalue { i32, i1 } %usub, 0
+ %carry = extractvalue { i32, i1 } %usub, 1
+ store i32 %val, ptr addrspace(1) %2, align 4
+ store i1 %carry, ptr addrspace(1) %3, align 1
+ ret void
+ }
+
+ ; Function Attrs: nounwind
+ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #1 {
+ %susubo64.kernarg.segment = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %susubo64.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %1 = load <4 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %out.load1 = extractelement <4 x i64> %1, i32 0
+ %2 = inttoptr i64 %out.load1 to ptr addrspace(1)
+ %carryout.load2 = extractelement <4 x i64> %1, i32 1
+ %3 = inttoptr i64 %carryout.load2 to ptr addrspace(1)
+ %a.load3 = extractelement <4 x i64> %1, i32 2
+ %b.load4 = extractelement <4 x i64> %1, i32 3
+ %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a.load3, i64 %b.load4)
+ %val = extractvalue { i64, i1 } %usub, 0
+ %carry = extractvalue { i64, i1 } %usub, 1
+ store i64 %val, ptr addrspace(1) %2, align 8
+ store i1 %carry, ptr addrspace(1) %3, align 1
+ ret void
+ }
+
+ ; Function Attrs: nounwind
+ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #1 {
+ %vusubo64.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %vusubo64.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %1 = load <3 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %out.load1 = extractelement <3 x i64> %1, i32 0
+ %2 = inttoptr i64 %out.load1 to ptr addrspace(1)
+ %carryout.load2 = extractelement <3 x i64> %1, i32 1
+ %3 = inttoptr i64 %carryout.load2 to ptr addrspace(1)
+ %a.load3 = extractelement <3 x i64> %1, i32 2
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a.load3, i64 %tid.ext)
+ %val = extractvalue { i64, i1 } %usub, 0
+ %carry = extractvalue { i64, i1 } %usub, 1
+ store i64 %val, ptr addrspace(1) %2, align 8
+ store i1 %carry, ptr addrspace(1) %3, align 1
+ ret void
+ }
+
+ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
+ %sudiv64.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %out.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %sudiv64.kernarg.segment, i64 36, !amdgpu.uniform !0
+ %1 = load <3 x i64>, ptr addrspace(4) %out.kernarg.offset, align 4, !invariant.load !0
+ %x.load2 = extractelement <3 x i64> %1, i32 1
+ %y.load3 = extractelement <3 x i64> %1, i32 2
+ %2 = or i64 %x.load2, %y.load3
+ %3 = and i64 %2, -4294967296
+ %4 = icmp ne i64 %3, 0
+ br i1 %4, label %12, label %Flow, !amdgpu.uniform !0
+
+ Flow: ; preds = %12, %0
+ %5 = phi i64 [ %13, %12 ], [ poison, %0 ]
+ %6 = phi i1 [ false, %12 ], [ true, %0 ]
+ br i1 %6, label %7, label %14, !amdgpu.uniform !0
+
+ 7: ; preds = %Flow
+ %8 = trunc i64 %y.load3 to i32
+ %9 = trunc i64 %x.load2 to i32
+ %10 = udiv i32 %9, %8
+ %11 = zext i32 %10 to i64
+ br label %14, !amdgpu.uniform !0
+
+ 12: ; preds = %0
+ %13 = udiv i64 %x.load2, %y.load3
+ br label %Flow, !amdgpu.uniform !0
+
+ 14: ; preds = %7, %Flow
+ %15 = phi i64 [ %5, %Flow ], [ %11, %7 ]
+ %out.load1 = extractelement <3 x i64> %1, i32 0
+ %16 = inttoptr i64 %out.load1 to ptr addrspace(1)
+ store i64 %15, ptr addrspace(1) %16, align 8
+ ret void
+ }
+
+ ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
+ declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #2
+
+ ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
+ declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #2
+
+ ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
+ declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #2
+
+ ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none)
+ declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #2
+
+ ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+ declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #3
+
+ ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+ declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #3
+
+ attributes #0 = { "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" }
+ attributes #1 = { nounwind }
+ attributes #2 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+ attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+ !0 = !{}
+...
+---
+name: sadd64rr
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 22, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 23, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 24, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 25, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 26, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 27, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 28, class: vreg_64, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 24
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: true
+ waveLimiter: true
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+ %13:sreg_32 = COPY %12.sub1
+ %14:sreg_32 = COPY %12.sub0
+ %15:sreg_32 = COPY %11.sub3
+ %16:sreg_32 = COPY %11.sub2
+ %17:sreg_32 = COPY %11.sub1
+ %18:sreg_32 = COPY %11.sub0
+ %19:sreg_64 = REG_SEQUENCE killed %18, %subreg.sub0, killed %17, %subreg.sub1
+ %20:sreg_32 = COPY %19.sub1
+ %21:sreg_32 = COPY %19.sub0
+ %22:sreg_32 = S_MOV_B32 61440
+ %23:sreg_32 = S_MOV_B32 -1
+ %24:sgpr_128 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1, killed %23, %subreg.sub2, killed %22, %subreg.sub3
+ %25:sreg_64 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1
+ %26:sreg_64 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1
+ %27:sreg_64 = S_ADD_U64_PSEUDO killed %25, killed %26, implicit-def dead $scc
+ %28:vreg_64 = COPY %27
+ BUFFER_STORE_DWORDX2_OFFSET killed %28, killed %24, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: sadd64ri
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 22, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 23, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 24, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 25, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 26, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 27, class: vreg_64, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 16
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_32 = COPY %11.sub1
+ %13:sreg_32 = COPY %11.sub0
+ %14:sreg_64 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1
+ %15:sreg_32 = COPY %14.sub1
+ %16:sreg_32 = COPY %14.sub0
+ %17:sreg_32 = S_MOV_B32 61440
+ %18:sreg_32 = S_MOV_B32 -1
+ %19:sgpr_128 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1, killed %18, %subreg.sub2, killed %17, %subreg.sub3
+ %20:sreg_32 = COPY %11.sub3
+ %21:sreg_32 = COPY %11.sub2
+ %22:sreg_64 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1
+ %23:sreg_32 = S_MOV_B32 4660
+ %24:sreg_32 = S_MOV_B32 1450743926
+ %25:sreg_64 = REG_SEQUENCE killed %24, %subreg.sub0, killed %23, %subreg.sub1
+ %26:sreg_64 = S_ADD_U64_PSEUDO killed %22, killed %25, implicit-def dead $scc
+ %27:vreg_64 = COPY %26
+ BUFFER_STORE_DWORDX2_OFFSET killed %27, killed %19, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: vadd64rr
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 22, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 23, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 24, class: vreg_64, preferred-register: '', flags: [ ] }
+ - { id: 25, class: vreg_64, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$vgpr0', virtual-reg: '%0' }
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 16
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %0:vgpr_32(s32) = COPY $vgpr0
+ %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_32 = COPY %11.sub1
+ %13:sreg_32 = COPY %11.sub0
+ %14:sreg_64 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1
+ %15:sreg_32 = COPY %14.sub1
+ %16:sreg_32 = COPY %14.sub0
+ %17:sreg_32 = S_MOV_B32 61440
+ %18:sreg_32 = S_MOV_B32 -1
+ %19:sgpr_128 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1, killed %18, %subreg.sub2, killed %17, %subreg.sub3
+ %20:sreg_32 = COPY %11.sub3
+ %21:sreg_32 = COPY %11.sub2
+ %22:sreg_64 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1
+ %23:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %24:vreg_64 = REG_SEQUENCE %0(s32), %subreg.sub0, killed %23, %subreg.sub1
+ %25:vreg_64 = V_ADD_U64_PSEUDO killed %22, killed %24, implicit-def dead $vcc, implicit $exec
+ BUFFER_STORE_DWORDX2_OFFSET killed %25, killed %19, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: vadd64ri
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 17, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: vreg_64, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 22, class: vreg_64, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$vgpr0', virtual-reg: '%0' }
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 8
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %0:vgpr_32(s32) = COPY $vgpr0
+ %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_32 = COPY %11.sub1
+ %13:sreg_32 = COPY %11.sub0
+ %14:sreg_32 = S_MOV_B32 61440
+ %15:sreg_32 = S_MOV_B32 -1
+ %16:sgpr_128 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1, killed %15, %subreg.sub2, killed %14, %subreg.sub3
+ %17:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %18:vreg_64 = REG_SEQUENCE %0(s32), %subreg.sub0, killed %17, %subreg.sub1
+ %19:sreg_32 = S_MOV_B32 4660
+ %20:sreg_32 = S_MOV_B32 1450743926
+ %21:sreg_64 = REG_SEQUENCE killed %20, %subreg.sub0, killed %19, %subreg.sub1
+ %22:vreg_64 = V_ADD_U64_PSEUDO killed %18, killed %21, implicit-def dead $vcc, implicit $exec
+ BUFFER_STORE_DWORDX2_OFFSET killed %22, killed %16, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: suaddo32
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: vgpr_32, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 24
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+ %13:sreg_32 = COPY %11.sub1
+ %14:sreg_32 = COPY %11.sub0
+ %15:sreg_32 = S_MOV_B32 61440
+ %16:sreg_32 = S_MOV_B32 -1
+ %17:sgpr_128 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1, killed %16, %subreg.sub2, killed %15, %subreg.sub3
+ %18:sreg_32 = COPY %12.sub0
+ %19:sreg_32 = COPY %12.sub1
+ %20:sreg_32 = S_ADD_I32 killed %18, killed %19, implicit-def dead $scc
+ %21:vgpr_32 = COPY %20
+ BUFFER_STORE_DWORD_OFFSET killed %21, killed %17, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: uaddo32_vcc_user
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 22, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 23, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 24, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 25, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 26, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 27, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 28, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 29, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 30, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 31, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 32, class: vgpr_32, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 24
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+ %13:sreg_32 = COPY %11.sub1
+ %14:sreg_32 = COPY %11.sub0
+ %15:sreg_64 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1
+ %16:sreg_32 = COPY %15.sub1
+ %17:sreg_32 = COPY %15.sub0
+ %18:sreg_32 = S_MOV_B32 61440
+ %19:sreg_32 = S_MOV_B32 -1
+ %20:sgpr_128 = REG_SEQUENCE killed %17, %subreg.sub0, killed %16, %subreg.sub1, %19, %subreg.sub2, %18, %subreg.sub3
+ %21:sreg_32 = COPY %11.sub3
+ %22:sreg_32 = COPY %11.sub2
+ %23:sreg_64 = REG_SEQUENCE killed %22, %subreg.sub0, killed %21, %subreg.sub1
+ %24:sreg_32 = COPY %23.sub1
+ %25:sreg_32 = COPY %23.sub0
+ %26:sgpr_128 = REG_SEQUENCE killed %25, %subreg.sub0, killed %24, %subreg.sub1, %19, %subreg.sub2, %18, %subreg.sub3
+ %27:sreg_32 = COPY %12.sub0
+ %28:sreg_32 = COPY %12.sub1
+ %31:vgpr_32 = COPY killed %28
+ %29:vgpr_32, %30:sreg_64_xexec = V_ADD_CO_U32_e64 killed %27, %31, 0, implicit $exec
+ BUFFER_STORE_DWORD_OFFSET killed %29, killed %20, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
+ %32:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %30, implicit $exec
+ BUFFER_STORE_BYTE_OFFSET killed %32, killed %26, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: suaddo64
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sgpr_256, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 22, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 23, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 24, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 25, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 26, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 27, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 28, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 29, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 30, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 31, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 32, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 33, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 34, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 35, class: vreg_64, preferred-register: '', flags: [ ] }
+ - { id: 36, class: vgpr_32, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 32
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %11:sgpr_256 = S_LOAD_DWORDX8_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s256) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_32 = COPY %11.sub1
+ %13:sreg_32 = COPY %11.sub0
+ %14:sreg_64 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1
+ %15:sreg_32 = COPY %14.sub1
+ %16:sreg_32 = COPY %14.sub0
+ %17:sreg_32 = S_MOV_B32 61440
+ %18:sreg_32 = S_MOV_B32 -1
+ %19:sgpr_128 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1, %18, %subreg.sub2, %17, %subreg.sub3
+ %20:sreg_32 = COPY %11.sub3
+ %21:sreg_32 = COPY %11.sub2
+ %22:sreg_64 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1
+ %23:sreg_32 = COPY %22.sub1
+ %24:sreg_32 = COPY %22.sub0
+ %25:sgpr_128 = REG_SEQUENCE killed %24, %subreg.sub0, killed %23, %subreg.sub1, %18, %subreg.sub2, %17, %subreg.sub3
+ %26:sreg_32 = COPY %11.sub5
+ %27:sreg_32 = COPY %11.sub4
+ %28:sreg_32 = COPY %11.sub7
+ %29:sreg_32 = COPY %11.sub6
+ %30:sreg_32, %31:sreg_64_xexec = S_UADDO_PSEUDO killed %27, killed %29, implicit-def dead $scc
+ %32:sreg_32, %33:sreg_64_xexec = S_ADD_CO_PSEUDO killed %26, killed %28, killed %31, implicit-def dead $scc
+ %34:sreg_64 = REG_SEQUENCE killed %30, %subreg.sub0, killed %32, %subreg.sub1
+ %35:vreg_64 = COPY %34
+ BUFFER_STORE_DWORDX2_OFFSET killed %35, killed %19, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+ %36:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %33, implicit $exec
+ BUFFER_STORE_BYTE_OFFSET killed %36, killed %25, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: vuaddo64
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 22, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 23, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 24, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 25, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 26, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 27, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 28, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 29, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 30, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 31, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 32, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 33, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 34, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 35, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 36, class: vreg_64, preferred-register: '', flags: [ ] }
+ - { id: 37, class: vgpr_32, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$vgpr0', virtual-reg: '%0' }
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 24
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %0:vgpr_32(s32) = COPY $vgpr0
+ %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+ %13:sreg_32 = COPY %12.sub1
+ %14:sreg_32 = COPY %12.sub0
+ %15:sreg_32 = COPY %11.sub3
+ %16:sreg_32 = COPY %11.sub2
+ %17:sreg_32 = COPY %11.sub1
+ %18:sreg_32 = COPY %11.sub0
+ %19:sreg_64 = REG_SEQUENCE killed %18, %subreg.sub0, killed %17, %subreg.sub1
+ %20:sreg_32 = COPY %19.sub1
+ %21:sreg_32 = COPY %19.sub0
+ %22:sreg_32 = S_MOV_B32 61440
+ %23:sreg_32 = S_MOV_B32 -1
+ %24:sgpr_128 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1, %23, %subreg.sub2, %22, %subreg.sub3
+ %25:sreg_64 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1
+ %26:sreg_32 = COPY %25.sub1
+ %27:sreg_32 = COPY %25.sub0
+ %28:sgpr_128 = REG_SEQUENCE killed %27, %subreg.sub0, killed %26, %subreg.sub1, %23, %subreg.sub2, %22, %subreg.sub3
+ %29:vgpr_32, %30:sreg_64_xexec = V_ADD_CO_U32_e64 killed %14, %0(s32), 0, implicit $exec
+ %31:sreg_32 = S_MOV_B32 0
+ %34:vgpr_32 = COPY killed %13
+ %35:vgpr_32 = COPY killed %31
+ %32:vgpr_32, %33:sreg_64_xexec = V_ADDC_U32_e64 %34, %35, killed %30, 0, implicit $exec
+ %36:vreg_64 = REG_SEQUENCE killed %29, %subreg.sub0, killed %32, %subreg.sub1
+ BUFFER_STORE_DWORDX2_OFFSET killed %36, killed %24, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+ %37:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %33, implicit $exec
+ BUFFER_STORE_BYTE_OFFSET killed %37, killed %28, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: ssub64rr
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 22, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 23, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 24, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 25, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 26, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 27, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 28, class: vreg_64, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 24
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: true
+ waveLimiter: true
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+ %13:sreg_32 = COPY %12.sub1
+ %14:sreg_32 = COPY %12.sub0
+ %15:sreg_32 = COPY %11.sub3
+ %16:sreg_32 = COPY %11.sub2
+ %17:sreg_32 = COPY %11.sub1
+ %18:sreg_32 = COPY %11.sub0
+ %19:sreg_64 = REG_SEQUENCE killed %18, %subreg.sub0, killed %17, %subreg.sub1
+ %20:sreg_32 = COPY %19.sub1
+ %21:sreg_32 = COPY %19.sub0
+ %22:sreg_32 = S_MOV_B32 61440
+ %23:sreg_32 = S_MOV_B32 -1
+ %24:sgpr_128 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1, killed %23, %subreg.sub2, killed %22, %subreg.sub3
+ %25:sreg_64 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1
+ %26:sreg_64 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1
+ %27:sreg_64 = S_SUB_U64_PSEUDO killed %25, killed %26, implicit-def dead $scc
+ %28:vreg_64 = COPY %27
+ BUFFER_STORE_DWORDX2_OFFSET killed %28, killed %24, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: ssub64ri
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 22, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 23, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 24, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 25, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 26, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 27, class: vreg_64, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 16
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_32 = COPY %11.sub1
+ %13:sreg_32 = COPY %11.sub0
+ %14:sreg_64 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1
+ %15:sreg_32 = COPY %14.sub1
+ %16:sreg_32 = COPY %14.sub0
+ %17:sreg_32 = S_MOV_B32 61440
+ %18:sreg_32 = S_MOV_B32 -1
+ %19:sgpr_128 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1, killed %18, %subreg.sub2, killed %17, %subreg.sub3
+ %20:sreg_32 = COPY %11.sub3
+ %21:sreg_32 = COPY %11.sub2
+ %22:sreg_64 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1
+ %23:sreg_32 = S_MOV_B32 4660
+ %24:sreg_32 = S_MOV_B32 1450743926
+ %25:sreg_64 = REG_SEQUENCE killed %24, %subreg.sub0, killed %23, %subreg.sub1
+ %26:sreg_64 = S_SUB_U64_PSEUDO killed %25, killed %22, implicit-def dead $scc
+ %27:vreg_64 = COPY %26
+ BUFFER_STORE_DWORDX2_OFFSET killed %27, killed %19, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: vsub64rr
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 22, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 23, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 24, class: vreg_64, preferred-register: '', flags: [ ] }
+ - { id: 25, class: vreg_64, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$vgpr0', virtual-reg: '%0' }
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 16
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %0:vgpr_32(s32) = COPY $vgpr0
+ %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_32 = COPY %11.sub1
+ %13:sreg_32 = COPY %11.sub0
+ %14:sreg_64 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1
+ %15:sreg_32 = COPY %14.sub1
+ %16:sreg_32 = COPY %14.sub0
+ %17:sreg_32 = S_MOV_B32 61440
+ %18:sreg_32 = S_MOV_B32 -1
+ %19:sgpr_128 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1, killed %18, %subreg.sub2, killed %17, %subreg.sub3
+ %20:sreg_32 = COPY %11.sub3
+ %21:sreg_32 = COPY %11.sub2
+ %22:sreg_64 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1
+ %23:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %24:vreg_64 = REG_SEQUENCE %0(s32), %subreg.sub0, killed %23, %subreg.sub1
+ %25:vreg_64 = V_SUB_U64_PSEUDO killed %22, killed %24, implicit-def dead $vcc, implicit $exec
+ BUFFER_STORE_DWORDX2_OFFSET killed %25, killed %19, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.1, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: vsub64ri
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 17, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: vreg_64, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 22, class: vreg_64, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$vgpr0', virtual-reg: '%0' }
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 8
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %0:vgpr_32(s32) = COPY $vgpr0
+ %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_32 = COPY %11.sub1
+ %13:sreg_32 = COPY %11.sub0
+ %14:sreg_32 = S_MOV_B32 61440
+ %15:sreg_32 = S_MOV_B32 -1
+ %16:sgpr_128 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1, killed %15, %subreg.sub2, killed %14, %subreg.sub3
+ %17:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %18:vreg_64 = REG_SEQUENCE %0(s32), %subreg.sub0, killed %17, %subreg.sub1
+ %19:sreg_32 = S_MOV_B32 4660
+ %20:sreg_32 = S_MOV_B32 1450743926
+ %21:sreg_64 = REG_SEQUENCE killed %20, %subreg.sub0, killed %19, %subreg.sub1
+ %22:vreg_64 = V_SUB_U64_PSEUDO killed %21, killed %18, implicit-def dead $vcc, implicit $exec
+ BUFFER_STORE_DWORDX2_OFFSET killed %22, killed %16, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: susubo32
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: vgpr_32, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 24
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+ %13:sreg_32 = COPY %11.sub1
+ %14:sreg_32 = COPY %11.sub0
+ %15:sreg_32 = S_MOV_B32 61440
+ %16:sreg_32 = S_MOV_B32 -1
+ %17:sgpr_128 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1, killed %16, %subreg.sub2, killed %15, %subreg.sub3
+ %18:sreg_32 = COPY %12.sub0
+ %19:sreg_32 = COPY %12.sub1
+ %20:sreg_32 = S_SUB_I32 killed %18, killed %19, implicit-def dead $scc
+ %21:vgpr_32 = COPY %20
+ BUFFER_STORE_DWORD_OFFSET killed %21, killed %17, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: usubo32_vcc_user
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 22, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 23, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 24, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 25, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 26, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 27, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 28, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 29, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 30, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 31, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 32, class: vgpr_32, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 24
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4)
+ %13:sreg_32 = COPY %11.sub1
+ %14:sreg_32 = COPY %11.sub0
+ %15:sreg_64 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1
+ %16:sreg_32 = COPY %15.sub1
+ %17:sreg_32 = COPY %15.sub0
+ %18:sreg_32 = S_MOV_B32 61440
+ %19:sreg_32 = S_MOV_B32 -1
+ %20:sgpr_128 = REG_SEQUENCE killed %17, %subreg.sub0, killed %16, %subreg.sub1, %19, %subreg.sub2, %18, %subreg.sub3
+ %21:sreg_32 = COPY %11.sub3
+ %22:sreg_32 = COPY %11.sub2
+ %23:sreg_64 = REG_SEQUENCE killed %22, %subreg.sub0, killed %21, %subreg.sub1
+ %24:sreg_32 = COPY %23.sub1
+ %25:sreg_32 = COPY %23.sub0
+ %26:sgpr_128 = REG_SEQUENCE killed %25, %subreg.sub0, killed %24, %subreg.sub1, %19, %subreg.sub2, %18, %subreg.sub3
+ %27:sreg_32 = COPY %12.sub0
+ %28:sreg_32 = COPY %12.sub1
+ %31:vgpr_32 = COPY killed %28
+ %29:vgpr_32, %30:sreg_64_xexec = V_SUB_CO_U32_e64 killed %27, %31, 0, implicit $exec
+ BUFFER_STORE_DWORD_OFFSET killed %29, killed %20, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
+ %32:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %30, implicit $exec
+ BUFFER_STORE_BYTE_OFFSET killed %32, killed %26, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: susubo64
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sgpr_256, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 22, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 23, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 24, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 25, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 26, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 27, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 28, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 29, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 30, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 31, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 32, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 33, class: vreg_64, preferred-register: '', flags: [ ] }
+ - { id: 34, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 35, class: vreg_64, preferred-register: '', flags: [ ] }
+ - { id: 36, class: vgpr_32, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 32
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %11:sgpr_256 = S_LOAD_DWORDX8_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s256) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_32 = COPY %11.sub1
+ %13:sreg_32 = COPY %11.sub0
+ %14:sreg_64 = REG_SEQUENCE killed %13, %subreg.sub0, killed %12, %subreg.sub1
+ %15:sreg_32 = COPY %14.sub1
+ %16:sreg_32 = COPY %14.sub0
+ %17:sreg_32 = S_MOV_B32 61440
+ %18:sreg_32 = S_MOV_B32 -1
+ %19:sgpr_128 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1, %18, %subreg.sub2, %17, %subreg.sub3
+ %20:sreg_32 = COPY %11.sub3
+ %21:sreg_32 = COPY %11.sub2
+ %22:sreg_64 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1
+ %23:sreg_32 = COPY %22.sub1
+ %24:sreg_32 = COPY %22.sub0
+ %25:sgpr_128 = REG_SEQUENCE killed %24, %subreg.sub0, killed %23, %subreg.sub1, %18, %subreg.sub2, %17, %subreg.sub3
+ %26:sreg_32 = COPY %11.sub5
+ %27:sreg_32 = COPY %11.sub4
+ %28:sreg_64 = REG_SEQUENCE killed %27, %subreg.sub0, killed %26, %subreg.sub1
+ %29:sreg_32 = COPY %11.sub7
+ %30:sreg_32 = COPY %11.sub6
+ %31:sreg_64 = REG_SEQUENCE killed %30, %subreg.sub0, killed %29, %subreg.sub1
+ %33:vreg_64 = COPY %31
+ %32:sreg_64_xexec = V_CMP_GT_U64_e64 %28, %33, implicit $exec
+ %34:sreg_64 = S_SUB_U64_PSEUDO %28, %31, implicit-def dead $scc
+ %35:vreg_64 = COPY %34
+ BUFFER_STORE_DWORDX2_OFFSET killed %35, killed %19, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+ %36:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %32, implicit $exec
+ BUFFER_STORE_BYTE_OFFSET killed %36, killed %25, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: vusubo64
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 1, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 2, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 8, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 22, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 23, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 24, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 25, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 26, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 27, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 28, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 29, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 30, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 31, class: vreg_64, preferred-register: '', flags: [ ] }
+ - { id: 32, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 33, class: vreg_64, preferred-register: '', flags: [ ] }
+ - { id: 34, class: vgpr_32, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$vgpr0', virtual-reg: '%0' }
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%5' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 24
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $sgpr4_sgpr5
+
+ %5:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %0:vgpr_32(s32) = COPY $vgpr0
+ %11:sgpr_128 = S_LOAD_DWORDX4_IMM %5(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+ %13:sreg_32 = COPY %12.sub1
+ %14:sreg_32 = COPY %12.sub0
+ %15:sreg_32 = COPY %11.sub3
+ %16:sreg_32 = COPY %11.sub2
+ %17:sreg_32 = COPY %11.sub1
+ %18:sreg_32 = COPY %11.sub0
+ %19:sreg_64 = REG_SEQUENCE killed %18, %subreg.sub0, killed %17, %subreg.sub1
+ %20:sreg_32 = COPY %19.sub1
+ %21:sreg_32 = COPY %19.sub0
+ %22:sreg_32 = S_MOV_B32 61440
+ %23:sreg_32 = S_MOV_B32 -1
+ %24:sgpr_128 = REG_SEQUENCE killed %21, %subreg.sub0, killed %20, %subreg.sub1, %23, %subreg.sub2, %22, %subreg.sub3
+ %25:sreg_64 = REG_SEQUENCE killed %16, %subreg.sub0, killed %15, %subreg.sub1
+ %26:sreg_32 = COPY %25.sub1
+ %27:sreg_32 = COPY %25.sub0
+ %28:sgpr_128 = REG_SEQUENCE killed %27, %subreg.sub0, killed %26, %subreg.sub1, %23, %subreg.sub2, %22, %subreg.sub3
+ %29:sreg_64 = REG_SEQUENCE killed %14, %subreg.sub0, killed %13, %subreg.sub1
+ %30:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %31:vreg_64 = REG_SEQUENCE %0(s32), %subreg.sub0, killed %30, %subreg.sub1
+ %32:sreg_64_xexec = V_CMP_GT_U64_e64 %29, %31, implicit $exec
+ %33:vreg_64 = V_SUB_U64_PSEUDO %29, %31, implicit-def dead $vcc, implicit $exec
+ BUFFER_STORE_DWORDX2_OFFSET killed %33, killed %24, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.2, addrspace 1)
+ %34:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %32, implicit $exec
+ BUFFER_STORE_BYTE_OFFSET killed %34, killed %28, 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: sudiv64
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: false
+isSSA: true
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: sgpr_192, preferred-register: '', flags: [ ] }
+ - { id: 1, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 2, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 3, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 4, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 5, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 6, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 7, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 8, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 9, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 10, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 11, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 12, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 13, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 14, class: sgpr_64, preferred-register: '', flags: [ ] }
+ - { id: 15, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 16, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 17, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 18, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 19, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 20, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 21, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 22, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 23, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 24, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 25, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 26, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 27, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 28, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 29, class: sgpr_192, preferred-register: '', flags: [ ] }
+ - { id: 30, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 31, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 32, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 33, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 34, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 35, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 36, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 37, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 38, class: vreg_64, preferred-register: '', flags: [ ] }
+ - { id: 39, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 40, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 41, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 42, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 43, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 44, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 45, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 46, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 47, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 48, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 49, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 50, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 51, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 52, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 53, class: sgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 54, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 55, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 56, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 57, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 58, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 59, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 60, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 61, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 62, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 63, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 64, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 65, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 66, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 67, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 68, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 69, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 70, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 71, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 72, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 73, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 74, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 75, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 76, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 77, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 78, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 79, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 80, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 81, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 82, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 83, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 84, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 85, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 86, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 87, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 88, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 89, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 90, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 91, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 92, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 93, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 94, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 95, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 96, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 97, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 98, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 99, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 100, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 101, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 102, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 103, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 104, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 105, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 106, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 107, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 108, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 109, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 110, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 111, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 112, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 113, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 114, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 115, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 116, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 117, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 118, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 119, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 120, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 121, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 122, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 123, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 124, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 125, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 126, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 127, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 128, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 129, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 130, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 131, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 132, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 133, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 134, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 135, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 136, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 137, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 138, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 139, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 140, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 141, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 142, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 143, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 144, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 145, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 146, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 147, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 148, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 149, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 150, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 151, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 152, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 153, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 154, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 155, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 156, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 157, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 158, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 159, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 160, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 161, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 162, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 163, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 164, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 165, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 166, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 167, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 168, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 169, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 170, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 171, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 172, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 173, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 174, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 175, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 176, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 177, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 178, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 179, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 180, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 181, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 182, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 183, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 184, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 185, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 186, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 187, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 188, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 189, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 190, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 191, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 192, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 193, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 194, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 195, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 196, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 197, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 198, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 199, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 200, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 201, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 202, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 203, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 204, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 205, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 206, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 207, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 208, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 209, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 210, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 211, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 212, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 213, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 214, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 215, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 216, class: sreg_64_xexec, preferred-register: '', flags: [ ] }
+ - { id: 217, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 218, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 219, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 220, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 221, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 222, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 223, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 224, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 225, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 226, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 227, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 228, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 229, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 230, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 231, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 232, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 233, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 234, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 235, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 236, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 237, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 238, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 239, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 240, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 241, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 242, class: vgpr_32, preferred-register: '', flags: [ ] }
+ - { id: 243, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 244, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 245, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 246, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 247, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 248, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 249, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 250, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 251, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 252, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 253, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 254, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 255, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 256, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 257, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 258, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 259, class: sreg_64, preferred-register: '', flags: [ ] }
+ - { id: 260, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 261, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 262, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 263, class: sreg_32, preferred-register: '', flags: [ ] }
+ - { id: 264, class: sgpr_128, preferred-register: '', flags: [ ] }
+ - { id: 265, class: vreg_64, preferred-register: '', flags: [ ] }
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%13' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ explicitKernArgSize: 24
+ maxKernArgAlign: 8
+ ldsSize: 0
+ gdsSize: 0
+ dynLDSAlign: 1
+ isEntryFunction: true
+ isChainFunction: false
+ noSignedZerosFPMath: false
+ memoryBound: false
+ waveLimiter: false
+ hasSpilledSGPRs: false
+ hasSpilledVGPRs: false
+ numWaveDispatchSGPRs: 0
+ numWaveDispatchVGPRs: 0
+ scratchRSrcReg: '$private_rsrc_reg'
+ frameOffsetReg: '$fp_reg'
+ stackPtrOffsetReg: '$sp_reg'
+ bytesInStackArgArea: 0
+ returnsVoid: true
+ argumentInfo:
+ dispatchPtr: { reg: '$sgpr0_sgpr1' }
+ queuePtr: { reg: '$sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ dispatchID: { reg: '$sgpr6_sgpr7' }
+ workGroupIDX: { reg: '$sgpr8' }
+ workGroupIDY: { reg: '$sgpr9' }
+ workGroupIDZ: { reg: '$sgpr10' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr11' }
+ workItemIDX: { reg: '$vgpr0' }
+ workItemIDY: { reg: '$vgpr1' }
+ workItemIDZ: { reg: '$vgpr2' }
+ psInputAddr: 0
+ psInputEnable: 0
+ maxMemoryClusterDWords: 8
+ mode:
+ ieee: true
+ dx10-clamp: true
+ fp32-input-denormals: true
+ fp32-output-denormals: true
+ fp64-fp16-input-denormals: true
+ fp64-fp16-output-denormals: true
+ highBitsOf32BitAddress: 0
+ occupancy: 10
+ vgprForAGPRCopy: ''
+ sgprForEXECCopy: ''
+ longBranchReservedReg: ''
+ hasInitWholeWave: false
+ dynamicVGPRBlockSize: 0
+ scratchReservedForDynamicVGPRs: 0
+ numKernargPreloadSGPRs: 0
+ isWholeWaveFunction: false
+body: |
+ bb.0 (%ir-block.0):
+ successors: %bb.3(0x50000000), %bb.1(0x30000000)
+ liveins: $sgpr4_sgpr5
+
+ %13:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %21:sgpr_128 = S_LOAD_DWORDX4_IMM %13(p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ %22:sreg_64_xexec = S_LOAD_DWORDX2_IMM %13(p4), 13, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset + 16, align 4, addrspace 4)
+ %23:sreg_32 = COPY %22.sub1
+ %24:sreg_32 = COPY %22.sub0
+ %25:sreg_32 = COPY %21.sub3
+ %26:sreg_32 = COPY %21.sub2
+ %27:sreg_32 = COPY %21.sub1
+ %28:sreg_32 = COPY %21.sub0
+ %29:sgpr_192 = REG_SEQUENCE killed %28, %subreg.sub0, killed %27, %subreg.sub1, %26, %subreg.sub2, %25, %subreg.sub3, %24, %subreg.sub4, %23, %subreg.sub5
+ %0:sgpr_192 = COPY %29
+ %30:sreg_64 = REG_SEQUENCE %26, %subreg.sub0, %25, %subreg.sub1
+ %1:sreg_64 = COPY %30
+ %31:sreg_64 = REG_SEQUENCE %24, %subreg.sub0, %23, %subreg.sub1
+ %2:sreg_64 = COPY %31
+ %32:sreg_64 = S_OR_B64 %30, %31, implicit-def dead $scc
+ %33:sreg_32 = COPY %32.sub1
+ %34:sreg_32 = S_MOV_B32 0
+ %35:sreg_64 = REG_SEQUENCE killed %34, %subreg.sub0, killed %33, %subreg.sub1
+ %36:sreg_64 = S_MOV_B64 0
+ %38:vreg_64 = COPY killed %36
+ %37:sreg_64 = V_CMP_NE_U64_e64 killed %35, %38, implicit $exec
+ %20:sreg_64 = S_MOV_B64 -1
+ %19:sreg_64 = IMPLICIT_DEF
+ %39:sreg_64 = S_AND_B64 $exec, killed %37, implicit-def dead $scc
+ $vcc = COPY %39
+ S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+ S_BRANCH %bb.1
+
+ bb.1.Flow:
+ successors: %bb.2(0x40000000), %bb.4(0x40000000)
+
+ %3:sreg_64 = PHI %19, %bb.0, %6, %bb.3
+ %4:sreg_64_xexec = PHI %20, %bb.0, %40, %bb.3
+ %224:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %4, implicit $exec
+ %225:sreg_32 = S_MOV_B32 1
+ %226:sreg_32 = COPY %224
+ S_CMP_LG_U32 killed %226, killed %225, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.4, implicit $scc
+ S_BRANCH %bb.2
+
+ bb.2 (%ir-block.7):
+ successors: %bb.4(0x80000000)
+
+ %227:sreg_32 = COPY %2.sub0
+ %228:sreg_32 = COPY %1.sub0
+ %229:sreg_32 = S_MOV_B32 0
+ %230:sreg_32 = S_SUB_I32 killed %229, %227, implicit-def dead $scc
+ %231:vgpr_32 = V_CVT_F32_U32_e32 %227, implicit $mode, implicit $exec
+ %232:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e32 killed %231, implicit $mode, implicit $exec
+ %233:vgpr_32 = nofpexcept V_MUL_F32_e32 1333788670, killed %232, implicit $mode, implicit $exec
+ %234:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 killed %233, implicit $mode, implicit $exec
+ %236:sreg_32 = COPY %234
+ %235:sreg_32 = S_MUL_I32 killed %230, %236
+ %237:vgpr_32 = V_MUL_HI_U32_e64 %234, killed %235, implicit $exec
+ %239:sreg_32 = COPY %234
+ %240:sreg_32 = COPY %237
+ %238:sreg_32 = S_ADD_I32 %239, killed %240, implicit-def dead $scc
+ %242:vgpr_32 = COPY killed %238
+ %241:vgpr_32 = V_MUL_HI_U32_e64 %228, %242, implicit $exec
+ %243:sreg_32 = S_MOV_B32 1
+ %245:sreg_32 = COPY %241
+ %244:sreg_32 = S_ADD_I32 %245, %243, implicit-def dead $scc
+ %247:sreg_32 = COPY %241
+ %246:sreg_32 = S_MUL_I32 %247, %227
+ %248:sreg_32 = S_SUB_I32 %228, killed %246, implicit-def dead $scc
+ %249:sreg_32 = S_SUB_I32 %248, %227, implicit-def dead $scc
+ S_CMP_GE_U32 %248, %227, implicit-def $scc
+ %250:sreg_32 = S_CSELECT_B32 killed %249, %248, implicit $scc
+ %252:sreg_32 = COPY %241
+ %251:sreg_32 = S_CSELECT_B32 killed %244, %252, implicit $scc
+ %253:sreg_32 = S_ADD_I32 %251, %243, implicit-def dead $scc
+ S_CMP_GE_U32 killed %250, %227, implicit-def $scc
+ %254:sreg_32 = S_CSELECT_B32 killed %253, %251, implicit $scc
+ %255:sreg_32 = S_MOV_B32 0
+ %256:sreg_64 = REG_SEQUENCE killed %254, %subreg.sub0, killed %255, %subreg.sub1
+ %5:sreg_64 = COPY %256
+ S_BRANCH %bb.4
+
+ bb.3 (%ir-block.12):
+ successors: %bb.1(0x80000000)
+
+ %41:sreg_32 = COPY %2.sub0
+ %42:vgpr_32 = V_CVT_F32_U32_e64 %41, 0, 0, implicit $mode, implicit $exec
+ %43:sreg_32 = COPY %2.sub1
+ %44:vgpr_32 = V_CVT_F32_U32_e64 %43, 0, 0, implicit $mode, implicit $exec
+ %45:sgpr_32 = S_MOV_B32 1333788672
+ %46:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %44, 0, killed %45, 0, killed %42, 0, 0, implicit $mode, implicit $exec
+ %47:vgpr_32 = nofpexcept V_RCP_F32_e64 0, killed %46, 0, 0, implicit $mode, implicit $exec
+ %48:sgpr_32 = S_MOV_B32 1602224124
+ %49:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %47, 0, killed %48, 0, 0, implicit $mode, implicit $exec
+ %50:sgpr_32 = S_MOV_B32 796917760
+ %51:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %49, 0, killed %50, 0, 0, implicit $mode, implicit $exec
+ %52:vgpr_32 = nofpexcept V_TRUNC_F32_e64 0, killed %51, 0, 0, implicit $mode, implicit $exec
+ %53:sgpr_32 = S_MOV_B32 -813694976
+ %54:vgpr_32 = nofpexcept V_FMA_F32_e64 0, %52, 0, killed %53, 0, %49, 0, 0, implicit $mode, implicit $exec
+ %55:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed %54, 0, 0, implicit $mode, implicit $exec
+ %56:sreg_64 = S_MOV_B64 0
+ %57:sreg_64 = S_SUB_U64_PSEUDO killed %56, %2, implicit-def dead $scc
+ %58:sreg_32 = COPY %57.sub1
+ %60:sreg_32 = COPY %55
+ %59:sreg_32 = S_MUL_I32 %58, %60
+ %61:sreg_32 = COPY %57.sub0
+ %62:vgpr_32 = V_MUL_HI_U32_e64 %61, %55, implicit $exec
+ %63:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, %52, 0, 0, implicit $mode, implicit $exec
+ %65:sreg_32 = COPY %63
+ %64:sreg_32 = S_MUL_I32 %61, %65
+ %67:sreg_32 = COPY %62
+ %66:sreg_32 = S_ADD_I32 killed %67, killed %64, implicit-def dead $scc
+ %68:sreg_32 = S_ADD_I32 killed %66, killed %59, implicit-def dead $scc
+ %69:vgpr_32 = V_MUL_HI_U32_e64 %55, %68, implicit $exec
+ %71:sreg_32 = COPY %55
+ %70:sreg_32 = S_MUL_I32 %71, %68
+ %72:sreg_64 = REG_SEQUENCE killed %70, %subreg.sub0, killed %69, %subreg.sub1
+ %74:sreg_32 = COPY %55
+ %73:sreg_32 = S_MUL_I32 %61, %74
+ %75:vgpr_32 = V_MUL_HI_U32_e64 %55, %73, implicit $exec
+ %76:sreg_32 = S_MOV_B32 0
+ %77:sreg_64 = REG_SEQUENCE killed %75, %subreg.sub0, %76, %subreg.sub1
+ %78:sreg_64 = S_ADD_U64_PSEUDO killed %77, killed %72, implicit-def dead $scc
+ %79:sreg_32 = COPY %78.sub0
+ %80:sreg_32 = COPY %78.sub1
+ %81:vgpr_32 = V_MUL_HI_U32_e64 %63, %68, implicit $exec
+ %82:vgpr_32 = V_MUL_HI_U32_e64 %63, %73, implicit $exec
+ %84:sreg_32 = COPY %63
+ %83:sreg_32 = S_MUL_I32 %84, %73
+ %85:sreg_64 = REG_SEQUENCE killed %83, %subreg.sub0, killed %82, %subreg.sub1
+ %86:sreg_32 = COPY %85.sub0
+ %87:sreg_32 = COPY %85.sub1
+ %88:sreg_32 = S_MOV_B32 0
+ %89:sreg_32 = S_ADD_U32 killed %79, killed %86, implicit-def $scc
+ %90:sreg_32 = S_ADDC_U32 killed %80, killed %87, implicit-def $scc, implicit $scc
+ %92:sreg_32 = COPY %81
+ %91:sreg_32 = S_ADDC_U32 killed %92, %88, implicit-def dead $scc, implicit $scc
+ %94:sreg_32 = COPY %63
+ %93:sreg_32 = S_MUL_I32 %94, %68
+ %95:sreg_64 = REG_SEQUENCE killed %93, %subreg.sub0, killed %91, %subreg.sub1
+ %96:sreg_64 = REG_SEQUENCE killed %89, %subreg.sub0, killed %90, %subreg.sub1
+ %97:sreg_32 = COPY %96.sub1
+ %98:sreg_64 = REG_SEQUENCE killed %97, %subreg.sub0, %88, %subreg.sub1
+ %99:sreg_64 = S_ADD_U64_PSEUDO killed %98, killed %95, implicit-def dead $scc
+ %100:sreg_32 = COPY %99.sub0
+ %103:sreg_32 = COPY %55
+ %101:sreg_32, %102:sreg_64_xexec = S_UADDO_PSEUDO %103, killed %100, implicit-def dead $scc
+ %104:sreg_32 = COPY %99.sub1
+ %107:sreg_32 = COPY %63
+ %105:sreg_32, %106:sreg_64_xexec = S_ADD_CO_PSEUDO %107, killed %104, killed %102, implicit-def dead $scc
+ %108:sreg_32 = S_MUL_I32 %61, %105
+ %110:vgpr_32 = COPY %101
+ %109:vgpr_32 = V_MUL_HI_U32_e64 %61, %110, implicit $exec
+ %112:sreg_32 = COPY %109
+ %111:sreg_32 = S_ADD_I32 killed %112, killed %108, implicit-def dead $scc
+ %113:sreg_32 = S_MUL_I32 %58, %101
+ %114:sreg_32 = S_ADD_I32 killed %111, killed %113, implicit-def dead $scc
+ %116:vgpr_32 = COPY %114
+ %115:vgpr_32 = V_MUL_HI_U32_e64 %105, %116, implicit $exec
+ %117:sreg_32 = S_MUL_I32 %61, %101
+ %119:vgpr_32 = COPY %117
+ %118:vgpr_32 = V_MUL_HI_U32_e64 %105, %119, implicit $exec
+ %120:sreg_32 = S_MUL_I32 %105, %117
+ %121:sreg_64 = REG_SEQUENCE killed %120, %subreg.sub0, killed %118, %subreg.sub1
+ %122:sreg_32 = COPY %121.sub0
+ %123:sreg_32 = COPY %121.sub1
+ %125:vgpr_32 = COPY %114
+ %124:vgpr_32 = V_MUL_HI_U32_e64 %101, %125, implicit $exec
+ %126:sreg_32 = S_MUL_I32 %101, %114
+ %127:sreg_64 = REG_SEQUENCE killed %126, %subreg.sub0, killed %124, %subreg.sub1
+ %129:vgpr_32 = COPY %117
+ %128:vgpr_32 = V_MUL_HI_U32_e64 %101, %129, implicit $exec
+ %130:sreg_64 = REG_SEQUENCE killed %128, %subreg.sub0, %76, %subreg.sub1
+ %131:sreg_64 = S_ADD_U64_PSEUDO killed %130, killed %127, implicit-def dead $scc
+ %132:sreg_32 = COPY %131.sub0
+ %133:sreg_32 = COPY %131.sub1
+ %134:sreg_32 = S_ADD_U32 killed %132, killed %122, implicit-def $scc
+ %135:sreg_32 = S_ADDC_U32 killed %133, killed %123, implicit-def $scc, implicit $scc
+ %137:sreg_32 = COPY %115
+ %136:sreg_32 = S_ADDC_U32 killed %137, %88, implicit-def dead $scc, implicit $scc
+ %138:sreg_32 = S_MUL_I32 %105, %114
+ %139:sreg_64 = REG_SEQUENCE killed %138, %subreg.sub0, killed %136, %subreg.sub1
+ %140:sreg_64 = REG_SEQUENCE killed %134, %subreg.sub0, killed %135, %subreg.sub1
+ %141:sreg_32 = COPY %140.sub1
+ %142:sreg_64 = REG_SEQUENCE killed %141, %subreg.sub0, %88, %subreg.sub1
+ %143:sreg_64 = S_ADD_U64_PSEUDO killed %142, killed %139, implicit-def dead $scc
+ %144:sreg_32 = COPY %143.sub0
+ %145:sreg_32, %146:sreg_64_xexec = S_UADDO_PSEUDO %101, killed %144, implicit-def dead $scc
+ %147:sreg_32 = COPY %143.sub1
+ %148:sreg_32, %149:sreg_64_xexec = S_ADD_CO_PSEUDO %105, killed %147, killed %146, implicit-def dead $scc
+ %150:sreg_32 = COPY %1.sub0
+ %152:vgpr_32 = COPY %148
+ %151:vgpr_32 = V_MUL_HI_U32_e64 %150, %152, implicit $exec
+ %153:sreg_32 = S_MUL_I32 %150, %148
+ %154:sreg_64 = REG_SEQUENCE killed %153, %subreg.sub0, killed %151, %subreg.sub1
+ %156:vgpr_32 = COPY %145
+ %155:vgpr_32 = V_MUL_HI_U32_e64 %150, %156, implicit $exec
+ %157:sreg_64 = REG_SEQUENCE killed %155, %subreg.sub0, %76, %subreg.sub1
+ %158:sreg_64 = S_ADD_U64_PSEUDO killed %157, killed %154, implicit-def dead $scc
+ %159:sreg_32 = COPY %158.sub0
+ %160:sreg_32 = COPY %158.sub1
+ %161:sreg_32 = COPY %1.sub1
+ %163:vgpr_32 = COPY %148
+ %162:vgpr_32 = V_MUL_HI_U32_e64 %161, %163, implicit $exec
+ %165:vgpr_32 = COPY %145
+ %164:vgpr_32 = V_MUL_HI_U32_e64 %161, %165, implicit $exec
+ %166:sreg_32 = S_MUL_I32 %161, %145
+ %167:sreg_64 = REG_SEQUENCE killed %166, %subreg.sub0, killed %164, %subreg.sub1
+ %168:sreg_32 = COPY %167.sub0
+ %169:sreg_32 = COPY %167.sub1
+ %170:sreg_32 = S_ADD_U32 killed %159, killed %168, implicit-def $scc
+ %171:sreg_32 = S_ADDC_U32 killed %160, killed %169, implicit-def $scc, implicit $scc
+ %173:sreg_32 = COPY %162
+ %172:sreg_32 = S_ADDC_U32 killed %173, %88, implicit-def dead $scc, implicit $scc
+ %174:sreg_32 = S_MUL_I32 %161, %148
+ %175:sreg_64 = REG_SEQUENCE killed %174, %subreg.sub0, killed %172, %subreg.sub1
+ %176:sreg_64 = REG_SEQUENCE killed %170, %subreg.sub0, killed %171, %subreg.sub1
+ %177:sreg_32 = COPY %176.sub1
+ %178:sreg_64 = REG_SEQUENCE killed %177, %subreg.sub0, %88, %subreg.sub1
+ %179:sreg_64 = S_ADD_U64_PSEUDO killed %178, killed %175, implicit-def dead $scc
+ %180:sreg_32 = COPY %179.sub1
+ %181:sreg_32 = S_MUL_I32 %41, %180
+ %182:sreg_32 = COPY %179.sub0
+ %184:vgpr_32 = COPY %182
+ %183:vgpr_32 = V_MUL_HI_U32_e64 %41, %184, implicit $exec
+ %186:sreg_32 = COPY %183
+ %185:sreg_32 = S_ADD_I32 killed %186, killed %181, implicit-def dead $scc
+ %187:sreg_32 = S_MUL_I32 %43, %182
+ %188:sreg_32 = S_ADD_I32 killed %185, killed %187, implicit-def dead $scc
+ %189:sreg_32 = S_SUB_I32 %161, %188, implicit-def dead $scc
+ %190:sreg_32 = S_MUL_I32 %41, %182
+ %191:sreg_32, %192:sreg_64_xexec = S_USUBO_PSEUDO %150, killed %190, implicit-def dead $scc
+ %193:sreg_32, %194:sreg_64_xexec = S_SUB_CO_PSEUDO killed %189, %43, %192, implicit-def dead $scc
+ %195:sreg_32, %196:sreg_64_xexec = S_USUBO_PSEUDO %191, %41, implicit-def dead $scc
+ %197:sreg_32, %198:sreg_64_xexec = S_SUB_CO_PSEUDO killed %193, %88, killed %196, implicit-def dead $scc
+ S_CMP_GE_U32 %197, %43, implicit-def $scc
+ %199:sreg_32 = S_MOV_B32 -1
+ %200:sreg_32 = S_CSELECT_B32 %199, %88, implicit $scc
+ S_CMP_GE_U32 killed %195, %41, implicit-def $scc
+ %201:sreg_32 = S_CSELECT_B32 %199, %88, implicit $scc
+ S_CMP_EQ_U32 %197, %43, implicit-def $scc
+ %202:sreg_32 = S_CSELECT_B32 killed %201, killed %200, implicit $scc
+ %203:sreg_32 = COPY killed %202
+ %204:sreg_64 = REG_SEQUENCE %182, %subreg.sub0, %180, %subreg.sub1
+ %205:sreg_64 = S_MOV_B64 1
+ %206:sreg_64 = S_ADD_U64_PSEUDO %204, killed %205, implicit-def dead $scc
+ %207:sreg_32 = COPY %206.sub0
+ %208:sreg_64 = S_MOV_B64 2
+ %209:sreg_64 = S_ADD_U64_PSEUDO %204, killed %208, implicit-def dead $scc
+ %210:sreg_32 = COPY %209.sub0
+ S_CMP_LG_U32 killed %203, %88, implicit-def $scc
+ %211:sreg_32 = S_CSELECT_B32 killed %210, killed %207, implicit $scc
+ %212:sreg_32 = COPY %206.sub1
+ %213:sreg_32 = COPY %209.sub1
+ %214:sreg_32 = S_CSELECT_B32 killed %213, killed %212, implicit $scc
+ %215:sreg_32, %216:sreg_64_xexec = S_SUB_CO_PSEUDO %161, %188, %192, implicit-def dead $scc
+ S_CMP_GE_U32 %215, %43, implicit-def $scc
+ %217:sreg_32 = S_CSELECT_B32 %199, %88, implicit $scc
+ S_CMP_GE_U32 %191, %41, implicit-def $scc
+ %218:sreg_32 = S_CSELECT_B32 %199, %88, implicit $scc
+ S_CMP_EQ_U32 %215, %43, implicit-def $scc
+ %219:sreg_32 = S_CSELECT_B32 killed %218, killed %217, implicit $scc
+ %220:sreg_32 = COPY killed %219
+ S_CMP_LG_U32 killed %220, %88, implicit-def $scc
+ %221:sreg_32 = S_CSELECT_B32 killed %214, %180, implicit $scc
+ %222:sreg_32 = S_CSELECT_B32 killed %211, %182, implicit $scc
+ %223:sreg_64 = REG_SEQUENCE killed %222, %subreg.sub0, killed %221, %subreg.sub1
+ %40:sreg_64 = S_MOV_B64 0
+ %6:sreg_64 = COPY %223
+ S_BRANCH %bb.1
+
+ bb.4 (%ir-block.14):
+ %7:sreg_64 = PHI %3, %bb.1, %5, %bb.2
+ %257:sreg_32 = COPY %0.sub1
+ %258:sreg_32 = COPY %0.sub0
+ %259:sreg_64 = REG_SEQUENCE killed %258, %subreg.sub0, killed %257, %subreg.sub1
+ %260:sreg_32 = COPY %259.sub1
+ %261:sreg_32 = COPY %259.sub0
+ %262:sreg_32 = S_MOV_B32 61440
+ %263:sreg_32 = S_MOV_B32 -1
+ %264:sgpr_128 = REG_SEQUENCE killed %261, %subreg.sub0, killed %260, %subreg.sub1, killed %263, %subreg.sub2, killed %262, %subreg.sub3
+ %265:vreg_64 = COPY %7
+ BUFFER_STORE_DWORDX2_OFFSET %265, killed %264, 0, 0, 0, 0, implicit $exec :: (store (s64) into %ir.16, addrspace 1)
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index b000fae124ede..b3025279081c2 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -665,18 +665,18 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_ushort v4, v[0:1]
-; VI-NEXT: flat_load_ushort v5, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_ushort v4, v[2:3]
+; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v5
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; VI-NEXT: v_and_b32_e32 v6, 0xffff, v5
; VI-NEXT: v_cmp_lt_u32_e32 vcc, v6, v4
@@ -690,10 +690,10 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[12:13]
-; GFX9-NEXT: global_load_ushort v2, v0, s[14:15]
+; GFX9-NEXT: global_load_ushort v1, v0, s[14:15]
+; GFX9-NEXT: global_load_ushort v2, v0, s[12:13]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v2, v1, v2
+; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
; GFX9-NEXT: v_cmp_lt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX9-NEXT: global_store_short v0, v2, s[8:9]
@@ -706,10 +706,10 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
-; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
+; GFX10-NEXT: global_load_ushort v1, v0, s[14:15]
+; GFX10-NEXT: global_load_ushort v2, v0, s[12:13]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v1
; GFX10-NEXT: v_cmp_lt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX10-NEXT: global_store_short v0, v2, s[8:9]
@@ -722,10 +722,10 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5]
-; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: global_load_d16_b16 v1, v0, s[6:7]
+; GFX11-NEXT: global_load_u16 v2, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v1, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
diff --git a/llvm/test/CodeGen/ARM/addsubo-legalization.ll b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
index e5789de4ca415..62a8e97d979b0 100644
--- a/llvm/test/CodeGen/ARM/addsubo-legalization.ll
+++ b/llvm/test/CodeGen/ARM/addsubo-legalization.ll
@@ -10,14 +10,14 @@ define <2 x i1> @uaddo(ptr %ptr, ptr %ptr2) {
; CHECK-LABEL: uaddo:
; CHECK: @ %bb.0:
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT: vmov r3, r2, d18
-; CHECK-NEXT: vadd.i64 q8, q9, q8
; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: vmov r6, r7, d19
-; CHECK-NEXT: vmov lr, r12, d16
-; CHECK-NEXT: vmov r4, r5, d17
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vmov r3, r2, d16
+; CHECK-NEXT: vadd.i64 q9, q9, q8
+; CHECK-NEXT: vmov r6, r7, d17
+; CHECK-NEXT: vmov lr, r12, d18
+; CHECK-NEXT: vmov r4, r5, d19
; CHECK-NEXT: subs.w r3, lr, r3
; CHECK-NEXT: sbcs.w r2, r12, r2
; CHECK-NEXT: mov.w r2, #0
@@ -33,7 +33,7 @@ define <2 x i1> @uaddo(ptr %ptr, ptr %ptr2) {
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: it ne
; CHECK-NEXT: movne.w r1, #-1
-; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0]
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
%x = load <2 x i64>, ptr %ptr, align 8
diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll
index 34b703a981105..78af50af71dee 100644
--- a/llvm/test/CodeGen/PowerPC/sat-add.ll
+++ b/llvm/test/CodeGen/PowerPC/sat-add.ll
@@ -113,10 +113,10 @@ define i32 @unsigned_sat_constant_i32_using_min(i32 %x) {
define i32 @unsigned_sat_constant_i32_using_cmp_sum(i32 %x) {
; CHECK-LABEL: unsigned_sat_constant_i32_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi 4, 3, 42
-; CHECK-NEXT: cmplw 4, 3
-; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: isellt 3, 3, 4
+; CHECK-NEXT: addi 3, 3, 42
+; CHECK-NEXT: li 4, -1
+; CHECK-NEXT: cmplwi 3, 42
+; CHECK-NEXT: isellt 3, 4, 3
; CHECK-NEXT: blr
%a = add i32 %x, 42
%c = icmp ugt i32 %x, %a
@@ -303,10 +303,10 @@ define i32 @unsigned_sat_variable_i32_using_min(i32 %x, i32 %y) {
define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) {
; CHECK-LABEL: unsigned_sat_variable_i32_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: add 4, 3, 4
-; CHECK-NEXT: cmplw 4, 3
-; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: isellt 3, 3, 4
+; CHECK-NEXT: add 3, 3, 4
+; CHECK-NEXT: cmplw 3, 4
+; CHECK-NEXT: li 4, -1
+; CHECK-NEXT: isellt 3, 4, 3
; CHECK-NEXT: blr
%a = add i32 %x, %y
%c = icmp ugt i32 %x, %a
diff --git a/llvm/test/CodeGen/RISCV/addcarry.ll b/llvm/test/CodeGen/RISCV/addcarry.ll
index 153c97faddec8..de1193fde98f8 100644
--- a/llvm/test/CodeGen/RISCV/addcarry.ll
+++ b/llvm/test/CodeGen/RISCV/addcarry.ll
@@ -16,7 +16,7 @@ define i64 @addcarry(i64 %x, i64 %y) nounwind {
; RISCV32-NEXT: add a7, a7, t0
; RISCV32-NEXT: add a4, t1, a6
; RISCV32-NEXT: sltu a5, t1, a5
-; RISCV32-NEXT: sltu a6, a4, t1
+; RISCV32-NEXT: sltu a6, a4, a6
; RISCV32-NEXT: add a5, a7, a5
; RISCV32-NEXT: add a5, a5, a6
; RISCV32-NEXT: mul a6, a1, a3
@@ -45,16 +45,16 @@ define i64 @addcarry(i64 %x, i64 %y) nounwind {
define { i32, i32, i1 } @addcarry_2x32(i32 %x0, i32 %x1, i32 %y0, i32 %y1) nounwind {
; RISCV32-LABEL: addcarry_2x32:
; RISCV32: # %bb.0:
-; RISCV32-NEXT: add a3, a1, a3
-; RISCV32-NEXT: add a4, a2, a4
-; RISCV32-NEXT: sltu a1, a3, a1
-; RISCV32-NEXT: sltu a2, a4, a2
-; RISCV32-NEXT: add a1, a4, a1
-; RISCV32-NEXT: sltu a4, a1, a4
-; RISCV32-NEXT: or a2, a2, a4
-; RISCV32-NEXT: sw a3, 0(a0)
-; RISCV32-NEXT: sw a1, 4(a0)
-; RISCV32-NEXT: sb a2, 8(a0)
+; RISCV32-NEXT: add a1, a1, a3
+; RISCV32-NEXT: add a2, a2, a4
+; RISCV32-NEXT: sltu a3, a1, a3
+; RISCV32-NEXT: sltu a4, a2, a4
+; RISCV32-NEXT: add a2, a2, a3
+; RISCV32-NEXT: sltu a3, a2, a3
+; RISCV32-NEXT: or a3, a4, a3
+; RISCV32-NEXT: sw a1, 0(a0)
+; RISCV32-NEXT: sw a2, 4(a0)
+; RISCV32-NEXT: sb a3, 8(a0)
; RISCV32-NEXT: ret
%t0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x0, i32 %y0)
%s0 = extractvalue { i32, i1 } %t0, 0
diff --git a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
index 84526a1fca0f9..af78eff8c3382 100644
--- a/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
+++ b/llvm/test/CodeGen/RISCV/arith-with-overflow.ll
@@ -39,9 +39,9 @@ entry:
define i1 @uadd(i32 %a, i32 %b, ptr %c) nounwind {
; RV32I-LABEL: uadd:
; RV32I: # %bb.0: # %entry
-; RV32I-NEXT: add a1, a0, a1
-; RV32I-NEXT: sltu a0, a1, a0
-; RV32I-NEXT: sw a1, 0(a2)
+; RV32I-NEXT: add a3, a0, a1
+; RV32I-NEXT: sltu a0, a3, a1
+; RV32I-NEXT: sw a3, 0(a2)
; RV32I-NEXT: ret
entry:
%x = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index 0306bb18c2aed..998bed0cc958a 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -69,12 +69,12 @@ define i64 @uaddo1_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
;
; RV64-LABEL: uaddo1_math_overflow_used:
; RV64: # %bb.0:
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: bltu a0, a1, .LBB1_2
+; RV64-NEXT: add a3, a1, a0
+; RV64-NEXT: bltu a3, a0, .LBB1_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a1, 42
; RV64-NEXT: .LBB1_2:
-; RV64-NEXT: sd a0, 0(a2)
+; RV64-NEXT: sd a3, 0(a2)
; RV64-NEXT: mv a0, a1
; RV64-NEXT: ret
%add = add i64 %b, %a
@@ -143,12 +143,12 @@ define i64 @uaddo2_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
;
; RV64-LABEL: uaddo2_math_overflow_used:
; RV64: # %bb.0:
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: bltu a0, a1, .LBB3_2
+; RV64-NEXT: add a3, a1, a0
+; RV64-NEXT: bltu a3, a0, .LBB3_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a1, 42
; RV64-NEXT: .LBB3_2:
-; RV64-NEXT: sd a0, 0(a2)
+; RV64-NEXT: sd a3, 0(a2)
; RV64-NEXT: mv a0, a1
; RV64-NEXT: ret
%add = add i64 %b, %a
@@ -217,12 +217,12 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
;
; RV64-LABEL: uaddo3_math_overflow_used:
; RV64: # %bb.0:
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: bltu a0, a1, .LBB5_2
+; RV64-NEXT: add a3, a1, a0
+; RV64-NEXT: bltu a3, a0, .LBB5_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a1, 42
; RV64-NEXT: .LBB5_2:
-; RV64-NEXT: sd a0, 0(a2)
+; RV64-NEXT: sd a3, 0(a2)
; RV64-NEXT: mv a0, a1
; RV64-NEXT: ret
%add = add i64 %b, %a
diff --git a/llvm/test/CodeGen/RISCV/uadd_sat.ll b/llvm/test/CodeGen/RISCV/uadd_sat.ll
index 4e0c4ab750592..a896aa89cfda2 100644
--- a/llvm/test/CodeGen/RISCV/uadd_sat.ll
+++ b/llvm/test/CodeGen/RISCV/uadd_sat.ll
@@ -7,10 +7,10 @@
define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
; RV32I-LABEL: func:
; RV32I: # %bb.0:
-; RV32I-NEXT: add a1, a0, a1
-; RV32I-NEXT: sltu a0, a1, a0
-; RV32I-NEXT: neg a0, a0
-; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: sltu a1, a0, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: ret
;
; RV64I-LABEL: func:
@@ -56,10 +56,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
;
; RV64I-LABEL: func2:
; RV64I: # %bb.0:
-; RV64I-NEXT: add a1, a0, a1
-; RV64I-NEXT: sltu a0, a1, a0
-; RV64I-NEXT: neg a0, a0
-; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: sltu a1, a0, a1
+; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func2:
diff --git a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
index a6afef4286dea..82bcff51b4c4d 100644
--- a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
@@ -8,10 +8,10 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
; RV32I-LABEL: func32:
; RV32I: # %bb.0:
; RV32I-NEXT: mul a1, a1, a2
-; RV32I-NEXT: add a1, a0, a1
-; RV32I-NEXT: sltu a0, a1, a0
-; RV32I-NEXT: neg a0, a0
-; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: sltu a1, a0, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: ret
;
; RV64I-LABEL: func32:
@@ -63,10 +63,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
;
; RV64I-LABEL: func64:
; RV64I: # %bb.0:
-; RV64I-NEXT: add a2, a0, a2
-; RV64I-NEXT: sltu a0, a2, a0
-; RV64I-NEXT: neg a0, a0
-; RV64I-NEXT: or a0, a0, a2
+; RV64I-NEXT: add a0, a0, a2
+; RV64I-NEXT: sltu a1, a0, a2
+; RV64I-NEXT: neg a1, a1
+; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: ret
;
; RV32IZbb-LABEL: func64:
diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
index 938e6550387f5..4b1e20696dbc8 100644
--- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -23,50 +23,50 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
; RISCV32-NEXT: mul t2, t0, a6
; RISCV32-NEXT: mulhu t3, t0, a6
; RISCV32-NEXT: mul t4, a4, a5
-; RISCV32-NEXT: mulhu t5, a4, a5
+; RISCV32-NEXT: mulhu s0, a4, a5
; RISCV32-NEXT: mul s2, t0, a5
-; RISCV32-NEXT: mul t6, a7, a4
+; RISCV32-NEXT: mul s1, a7, a4
; RISCV32-NEXT: mul s3, a3, a6
-; RISCV32-NEXT: mul s0, t0, a7
-; RISCV32-NEXT: mul s1, a2, a4
+; RISCV32-NEXT: mul t6, t0, a7
+; RISCV32-NEXT: mul t5, a2, a4
; RISCV32-NEXT: mul s4, a5, a3
-; RISCV32-NEXT: add s1, s1, s0
-; RISCV32-NEXT: mul s0, a1, a6
-; RISCV32-NEXT: add s4, s0, s4
-; RISCV32-NEXT: mulhu s5, t0, a5
+; RISCV32-NEXT: add t6, t5, t6
+; RISCV32-NEXT: mul t5, a1, a6
+; RISCV32-NEXT: add t5, t5, s4
+; RISCV32-NEXT: mulhu s4, t0, a5
; RISCV32-NEXT: add t1, t2, t1
; RISCV32-NEXT: sltu t2, t1, t2
; RISCV32-NEXT: add t2, t3, t2
-; RISCV32-NEXT: mulhu s0, a7, a4
+; RISCV32-NEXT: mulhu t3, a7, a4
; RISCV32-NEXT: add t1, t4, t1
-; RISCV32-NEXT: sltu t3, t1, t4
-; RISCV32-NEXT: add t3, t5, t3
-; RISCV32-NEXT: mulhu t5, a3, a6
-; RISCV32-NEXT: add t4, s3, t6
-; RISCV32-NEXT: add s1, s0, s1
-; RISCV32-NEXT: add t6, t5, s4
-; RISCV32-NEXT: sltu s3, t4, s3
-; RISCV32-NEXT: add t3, t2, t3
-; RISCV32-NEXT: sltu t2, t3, t2
-; RISCV32-NEXT: add s5, s5, t2
-; RISCV32-NEXT: add s4, t6, s1
+; RISCV32-NEXT: sltu t4, t1, t4
+; RISCV32-NEXT: add t4, s0, t4
+; RISCV32-NEXT: mulhu s0, a3, a6
+; RISCV32-NEXT: add s5, s3, s1
+; RISCV32-NEXT: add s1, t3, t6
+; RISCV32-NEXT: add s0, s0, t5
+; RISCV32-NEXT: sltu t3, s5, s3
+; RISCV32-NEXT: add t4, t2, t4
+; RISCV32-NEXT: sltu t2, t4, t2
+; RISCV32-NEXT: add s4, s4, t2
+; RISCV32-NEXT: add s3, s0, s1
+; RISCV32-NEXT: add t4, s2, t4
+; RISCV32-NEXT: add t2, t4, s5
+; RISCV32-NEXT: sltu s2, t4, s2
+; RISCV32-NEXT: sltu t4, t2, t4
+; RISCV32-NEXT: add s2, s4, s2
+; RISCV32-NEXT: add t3, s3, t3
; RISCV32-NEXT: add t3, s2, t3
-; RISCV32-NEXT: add t2, t3, t4
-; RISCV32-NEXT: sltu s2, t3, s2
-; RISCV32-NEXT: sltu t4, t2, t3
-; RISCV32-NEXT: add s2, s5, s2
-; RISCV32-NEXT: add s3, s4, s3
-; RISCV32-NEXT: add t3, s2, s3
; RISCV32-NEXT: add t3, t3, t4
; RISCV32-NEXT: beq t3, s2, .LBB0_2
; RISCV32-NEXT: # %bb.1: # %start
; RISCV32-NEXT: sltu t4, t3, s2
; RISCV32-NEXT: .LBB0_2: # %start
-; RISCV32-NEXT: sltu s0, s1, s0
+; RISCV32-NEXT: sltu t6, s1, t6
; RISCV32-NEXT: snez s1, t0
; RISCV32-NEXT: snez s2, a2
-; RISCV32-NEXT: sltu t5, t6, t5
-; RISCV32-NEXT: mulhu t6, a2, a4
+; RISCV32-NEXT: sltu t5, s0, t5
+; RISCV32-NEXT: mulhu s0, a2, a4
; RISCV32-NEXT: mulhu t0, t0, a7
; RISCV32-NEXT: or a2, a7, a2
; RISCV32-NEXT: snez a7, a5
@@ -76,19 +76,19 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
; RISCV32-NEXT: or a3, a3, a1
; RISCV32-NEXT: snez a1, a1
; RISCV32-NEXT: and s1, s2, s1
-; RISCV32-NEXT: snez t6, t6
+; RISCV32-NEXT: snez s0, s0
; RISCV32-NEXT: snez t0, t0
; RISCV32-NEXT: and a1, a1, a7
; RISCV32-NEXT: snez a6, a6
; RISCV32-NEXT: snez a5, a5
; RISCV32-NEXT: snez a2, a2
; RISCV32-NEXT: snez a3, a3
-; RISCV32-NEXT: or a7, s1, t6
+; RISCV32-NEXT: or s0, s1, s0
; RISCV32-NEXT: or a1, a1, a6
; RISCV32-NEXT: and a2, a3, a2
-; RISCV32-NEXT: or a3, a7, t0
+; RISCV32-NEXT: or a3, s0, t0
; RISCV32-NEXT: or a1, a1, a5
-; RISCV32-NEXT: or a3, a3, s0
+; RISCV32-NEXT: or a3, a3, t6
; RISCV32-NEXT: or a1, a1, t5
; RISCV32-NEXT: or a1, a2, a1
; RISCV32-NEXT: or a1, a1, a3
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index 0de2cbd76b749..7ea2ed7a2063b 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -464,9 +464,9 @@ entry:
define zeroext i1 @uaddo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
; RV32-LABEL: uaddo.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: add a1, a0, a1
-; RV32-NEXT: sltu a0, a1, a0
-; RV32-NEXT: sw a1, 0(a2)
+; RV32-NEXT: add a3, a0, a1
+; RV32-NEXT: sltu a0, a3, a1
+; RV32-NEXT: sw a3, 0(a2)
; RV32-NEXT: ret
;
; RV64-LABEL: uaddo.i32:
@@ -478,9 +478,9 @@ define zeroext i1 @uaddo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
;
; RV32ZBA-LABEL: uaddo.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: add a1, a0, a1
-; RV32ZBA-NEXT: sltu a0, a1, a0
-; RV32ZBA-NEXT: sw a1, 0(a2)
+; RV32ZBA-NEXT: add a3, a0, a1
+; RV32ZBA-NEXT: sltu a0, a3, a1
+; RV32ZBA-NEXT: sw a3, 0(a2)
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: uaddo.i32:
@@ -492,9 +492,9 @@ define zeroext i1 @uaddo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
;
; RV32ZICOND-LABEL: uaddo.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: add a1, a0, a1
-; RV32ZICOND-NEXT: sltu a0, a1, a0
-; RV32ZICOND-NEXT: sw a1, 0(a2)
+; RV32ZICOND-NEXT: add a3, a0, a1
+; RV32ZICOND-NEXT: sltu a0, a3, a1
+; RV32ZICOND-NEXT: sw a3, 0(a2)
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: uaddo.i32:
@@ -515,7 +515,7 @@ define zeroext i1 @uaddo.i32.constant(i32 signext %v1, ptr %res) {
; RV32-LABEL: uaddo.i32.constant:
; RV32: # %bb.0: # %entry
; RV32-NEXT: addi a2, a0, -2
-; RV32-NEXT: sltu a0, a2, a0
+; RV32-NEXT: sltiu a0, a2, -2
; RV32-NEXT: sw a2, 0(a1)
; RV32-NEXT: ret
;
@@ -529,7 +529,7 @@ define zeroext i1 @uaddo.i32.constant(i32 signext %v1, ptr %res) {
; RV32ZBA-LABEL: uaddo.i32.constant:
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: addi a2, a0, -2
-; RV32ZBA-NEXT: sltu a0, a2, a0
+; RV32ZBA-NEXT: sltiu a0, a2, -2
; RV32ZBA-NEXT: sw a2, 0(a1)
; RV32ZBA-NEXT: ret
;
@@ -543,7 +543,7 @@ define zeroext i1 @uaddo.i32.constant(i32 signext %v1, ptr %res) {
; RV32ZICOND-LABEL: uaddo.i32.constant:
; RV32ZICOND: # %bb.0: # %entry
; RV32ZICOND-NEXT: addi a2, a0, -2
-; RV32ZICOND-NEXT: sltu a0, a2, a0
+; RV32ZICOND-NEXT: sltiu a0, a2, -2
; RV32ZICOND-NEXT: sw a2, 0(a1)
; RV32ZICOND-NEXT: ret
;
@@ -628,9 +628,9 @@ define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV64-LABEL: uaddo.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: add a1, a0, a1
-; RV64-NEXT: sltu a0, a1, a0
-; RV64-NEXT: sd a1, 0(a2)
+; RV64-NEXT: add a3, a0, a1
+; RV64-NEXT: sltu a0, a3, a1
+; RV64-NEXT: sd a3, 0(a2)
; RV64-NEXT: ret
;
; RV32ZBA-LABEL: uaddo.i64:
@@ -649,9 +649,9 @@ define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV64ZBA-LABEL: uaddo.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: add a1, a0, a1
-; RV64ZBA-NEXT: sltu a0, a1, a0
-; RV64ZBA-NEXT: sd a1, 0(a2)
+; RV64ZBA-NEXT: add a3, a0, a1
+; RV64ZBA-NEXT: sltu a0, a3, a1
+; RV64ZBA-NEXT: sd a3, 0(a2)
; RV64ZBA-NEXT: ret
;
; RV32ZICOND-LABEL: uaddo.i64:
@@ -671,9 +671,9 @@ define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV64ZICOND-LABEL: uaddo.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: add a1, a0, a1
-; RV64ZICOND-NEXT: sltu a0, a1, a0
-; RV64ZICOND-NEXT: sd a1, 0(a2)
+; RV64ZICOND-NEXT: add a3, a0, a1
+; RV64ZICOND-NEXT: sltu a0, a3, a1
+; RV64ZICOND-NEXT: sd a3, 0(a2)
; RV64ZICOND-NEXT: ret
entry:
%t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
@@ -1788,13 +1788,13 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32-NEXT: and a1, a1, t0
; RV32-NEXT: snez a0, a0
; RV32-NEXT: snez a2, a3
-; RV32-NEXT: add a5, a7, a5
+; RV32-NEXT: add a7, a7, a5
; RV32-NEXT: or a0, a1, a0
-; RV32-NEXT: sltu a1, a5, a7
+; RV32-NEXT: sltu a1, a7, a5
; RV32-NEXT: or a0, a0, a2
; RV32-NEXT: or a0, a0, a1
; RV32-NEXT: sw t1, 0(a4)
-; RV32-NEXT: sw a5, 4(a4)
+; RV32-NEXT: sw a7, 4(a4)
; RV32-NEXT: ret
;
; RV64-LABEL: umulo.i64:
@@ -1820,13 +1820,13 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32ZBA-NEXT: and a1, a1, t0
; RV32ZBA-NEXT: snez a0, a0
; RV32ZBA-NEXT: snez a2, a3
-; RV32ZBA-NEXT: add a5, a7, a5
+; RV32ZBA-NEXT: add a7, a7, a5
; RV32ZBA-NEXT: or a0, a1, a0
-; RV32ZBA-NEXT: sltu a1, a5, a7
+; RV32ZBA-NEXT: sltu a1, a7, a5
; RV32ZBA-NEXT: or a0, a0, a2
; RV32ZBA-NEXT: or a0, a0, a1
; RV32ZBA-NEXT: sw t1, 0(a4)
-; RV32ZBA-NEXT: sw a5, 4(a4)
+; RV32ZBA-NEXT: sw a7, 4(a4)
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: umulo.i64:
@@ -1851,13 +1851,13 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32ZICOND-NEXT: czero.eqz a1, a6, a1
; RV32ZICOND-NEXT: snez a0, a0
; RV32ZICOND-NEXT: snez a2, a3
-; RV32ZICOND-NEXT: add a5, a7, a5
+; RV32ZICOND-NEXT: add a7, a7, a5
; RV32ZICOND-NEXT: or a0, a1, a0
-; RV32ZICOND-NEXT: sltu a1, a5, a7
+; RV32ZICOND-NEXT: sltu a1, a7, a5
; RV32ZICOND-NEXT: or a0, a0, a2
; RV32ZICOND-NEXT: or a0, a0, a1
; RV32ZICOND-NEXT: sw t0, 0(a4)
-; RV32ZICOND-NEXT: sw a5, 4(a4)
+; RV32ZICOND-NEXT: sw a7, 4(a4)
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: umulo.i64:
@@ -1884,12 +1884,12 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
; RV32-NEXT: mulhu a5, a0, a3
; RV32-NEXT: mulhu a1, a1, a3
; RV32-NEXT: mul a3, a0, a3
-; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: add a5, a5, a4
; RV32-NEXT: snez a0, a1
-; RV32-NEXT: sltu a1, a4, a5
+; RV32-NEXT: sltu a1, a5, a4
; RV32-NEXT: or a0, a0, a1
; RV32-NEXT: sw a3, 0(a2)
-; RV32-NEXT: sw a4, 4(a2)
+; RV32-NEXT: sw a5, 4(a2)
; RV32-NEXT: ret
;
; RV64-LABEL: umulo2.i64:
@@ -1911,12 +1911,12 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
; RV32ZBA-NEXT: mulhu a1, a1, a3
; RV32ZBA-NEXT: mulhu a3, a0, a3
; RV32ZBA-NEXT: sh2add a5, a5, a0
-; RV32ZBA-NEXT: add a4, a3, a4
+; RV32ZBA-NEXT: add a3, a3, a4
; RV32ZBA-NEXT: snez a0, a1
-; RV32ZBA-NEXT: sltu a1, a4, a3
+; RV32ZBA-NEXT: sltu a1, a3, a4
; RV32ZBA-NEXT: or a0, a0, a1
; RV32ZBA-NEXT: sw a5, 0(a2)
-; RV32ZBA-NEXT: sw a4, 4(a2)
+; RV32ZBA-NEXT: sw a3, 4(a2)
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: umulo2.i64:
@@ -1937,12 +1937,12 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
; RV32ZICOND-NEXT: mulhu a5, a0, a3
; RV32ZICOND-NEXT: mulhu a1, a1, a3
; RV32ZICOND-NEXT: mul a3, a0, a3
-; RV32ZICOND-NEXT: add a4, a5, a4
+; RV32ZICOND-NEXT: add a5, a5, a4
; RV32ZICOND-NEXT: snez a0, a1
-; RV32ZICOND-NEXT: sltu a1, a4, a5
+; RV32ZICOND-NEXT: sltu a1, a5, a4
; RV32ZICOND-NEXT: or a0, a0, a1
; RV32ZICOND-NEXT: sw a3, 0(a2)
-; RV32ZICOND-NEXT: sw a4, 4(a2)
+; RV32ZICOND-NEXT: sw a5, 4(a2)
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: umulo2.i64:
@@ -2266,7 +2266,7 @@ define i32 @uaddo.select.i32(i32 signext %v1, i32 signext %v2) {
; RV32-LABEL: uaddo.select.i32:
; RV32: # %bb.0: # %entry
; RV32-NEXT: add a2, a0, a1
-; RV32-NEXT: bltu a2, a0, .LBB32_2
+; RV32-NEXT: bltu a2, a1, .LBB32_2
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: mv a0, a1
; RV32-NEXT: .LBB32_2: # %entry
@@ -2284,7 +2284,7 @@ define i32 @uaddo.select.i32(i32 signext %v1, i32 signext %v2) {
; RV32ZBA-LABEL: uaddo.select.i32:
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: add a2, a0, a1
-; RV32ZBA-NEXT: bltu a2, a0, .LBB32_2
+; RV32ZBA-NEXT: bltu a2, a1, .LBB32_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: mv a0, a1
; RV32ZBA-NEXT: .LBB32_2: # %entry
@@ -2302,7 +2302,7 @@ define i32 @uaddo.select.i32(i32 signext %v1, i32 signext %v2) {
; RV32ZICOND-LABEL: uaddo.select.i32:
; RV32ZICOND: # %bb.0: # %entry
; RV32ZICOND-NEXT: add a2, a0, a1
-; RV32ZICOND-NEXT: sltu a2, a2, a0
+; RV32ZICOND-NEXT: sltu a2, a2, a1
; RV32ZICOND-NEXT: czero.nez a1, a1, a2
; RV32ZICOND-NEXT: czero.eqz a0, a0, a2
; RV32ZICOND-NEXT: or a0, a0, a1
@@ -2326,8 +2326,8 @@ entry:
define i1 @uaddo.not.i32(i32 signext %v1, i32 signext %v2) {
; RV32-LABEL: uaddo.not.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: add a1, a0, a1
-; RV32-NEXT: sltu a0, a1, a0
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: sltu a0, a0, a1
; RV32-NEXT: xori a0, a0, 1
; RV32-NEXT: ret
;
@@ -2340,8 +2340,8 @@ define i1 @uaddo.not.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZBA-LABEL: uaddo.not.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: add a1, a0, a1
-; RV32ZBA-NEXT: sltu a0, a1, a0
+; RV32ZBA-NEXT: add a0, a0, a1
+; RV32ZBA-NEXT: sltu a0, a0, a1
; RV32ZBA-NEXT: xori a0, a0, 1
; RV32ZBA-NEXT: ret
;
@@ -2354,8 +2354,8 @@ define i1 @uaddo.not.i32(i32 signext %v1, i32 signext %v2) {
;
; RV32ZICOND-LABEL: uaddo.not.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: add a1, a0, a1
-; RV32ZICOND-NEXT: sltu a0, a1, a0
+; RV32ZICOND-NEXT: add a0, a0, a1
+; RV32ZICOND-NEXT: sltu a0, a0, a1
; RV32ZICOND-NEXT: xori a0, a0, 1
; RV32ZICOND-NEXT: ret
;
@@ -2395,7 +2395,7 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
; RV64-LABEL: uaddo.select.i64:
; RV64: # %bb.0: # %entry
; RV64-NEXT: add a2, a0, a1
-; RV64-NEXT: bltu a2, a0, .LBB34_2
+; RV64-NEXT: bltu a2, a1, .LBB34_2
; RV64-NEXT: # %bb.1: # %entry
; RV64-NEXT: mv a0, a1
; RV64-NEXT: .LBB34_2: # %entry
@@ -2423,7 +2423,7 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
; RV64ZBA-LABEL: uaddo.select.i64:
; RV64ZBA: # %bb.0: # %entry
; RV64ZBA-NEXT: add a2, a0, a1
-; RV64ZBA-NEXT: bltu a2, a0, .LBB34_2
+; RV64ZBA-NEXT: bltu a2, a1, .LBB34_2
; RV64ZBA-NEXT: # %bb.1: # %entry
; RV64ZBA-NEXT: mv a0, a1
; RV64ZBA-NEXT: .LBB34_2: # %entry
@@ -2451,7 +2451,7 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
; RV64ZICOND-LABEL: uaddo.select.i64:
; RV64ZICOND: # %bb.0: # %entry
; RV64ZICOND-NEXT: add a2, a0, a1
-; RV64ZICOND-NEXT: sltu a2, a2, a0
+; RV64ZICOND-NEXT: sltu a2, a2, a1
; RV64ZICOND-NEXT: czero.nez a1, a1, a2
; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
; RV64ZICOND-NEXT: or a0, a0, a1
@@ -2479,8 +2479,8 @@ define i1 @uaddo.not.i64(i64 %v1, i64 %v2) {
;
; RV64-LABEL: uaddo.not.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: add a1, a0, a1
-; RV64-NEXT: sltu a0, a1, a0
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: sltu a0, a0, a1
; RV64-NEXT: xori a0, a0, 1
; RV64-NEXT: ret
;
@@ -2499,8 +2499,8 @@ define i1 @uaddo.not.i64(i64 %v1, i64 %v2) {
;
; RV64ZBA-LABEL: uaddo.not.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: add a1, a0, a1
-; RV64ZBA-NEXT: sltu a0, a1, a0
+; RV64ZBA-NEXT: add a0, a0, a1
+; RV64ZBA-NEXT: sltu a0, a0, a1
; RV64ZBA-NEXT: xori a0, a0, 1
; RV64ZBA-NEXT: ret
;
@@ -2520,8 +2520,8 @@ define i1 @uaddo.not.i64(i64 %v1, i64 %v2) {
;
; RV64ZICOND-LABEL: uaddo.not.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: add a1, a0, a1
-; RV64ZICOND-NEXT: sltu a0, a1, a0
+; RV64ZICOND-NEXT: add a0, a0, a1
+; RV64ZICOND-NEXT: sltu a0, a0, a1
; RV64ZICOND-NEXT: xori a0, a0, 1
; RV64ZICOND-NEXT: ret
entry:
@@ -3623,8 +3623,8 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: snez a6, a6
; RV32-NEXT: or a5, a5, a6
; RV32-NEXT: mulhu a6, a0, a2
-; RV32-NEXT: add a4, a6, a4
-; RV32-NEXT: sltu a4, a4, a6
+; RV32-NEXT: add a6, a6, a4
+; RV32-NEXT: sltu a4, a6, a4
; RV32-NEXT: mulhu a6, a3, a0
; RV32-NEXT: snez a6, a6
; RV32-NEXT: or a5, a5, a6
@@ -3657,8 +3657,8 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: snez a6, a6
; RV32ZBA-NEXT: or a5, a5, a6
; RV32ZBA-NEXT: mulhu a6, a0, a2
-; RV32ZBA-NEXT: add a4, a6, a4
-; RV32ZBA-NEXT: sltu a4, a4, a6
+; RV32ZBA-NEXT: add a6, a6, a4
+; RV32ZBA-NEXT: sltu a4, a6, a4
; RV32ZBA-NEXT: mulhu a6, a3, a0
; RV32ZBA-NEXT: snez a6, a6
; RV32ZBA-NEXT: or a5, a5, a6
@@ -3690,8 +3690,8 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
; RV32ZICOND-NEXT: snez a5, a5
; RV32ZICOND-NEXT: or a5, a6, a5
; RV32ZICOND-NEXT: mulhu a6, a0, a2
-; RV32ZICOND-NEXT: add a4, a6, a4
-; RV32ZICOND-NEXT: sltu a4, a4, a6
+; RV32ZICOND-NEXT: add a6, a6, a4
+; RV32ZICOND-NEXT: sltu a4, a6, a4
; RV32ZICOND-NEXT: mulhu a6, a3, a0
; RV32ZICOND-NEXT: snez a6, a6
; RV32ZICOND-NEXT: or a5, a5, a6
@@ -3732,9 +3732,9 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: and a1, a1, a3
; RV32-NEXT: snez a2, a2
; RV32-NEXT: snez a0, a0
-; RV32-NEXT: add a4, a6, a4
+; RV32-NEXT: add a6, a6, a4
; RV32-NEXT: or a1, a1, a2
-; RV32-NEXT: sltu a2, a4, a6
+; RV32-NEXT: sltu a2, a6, a4
; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: or a0, a0, a2
; RV32-NEXT: xori a0, a0, 1
@@ -3759,9 +3759,9 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: and a1, a1, a3
; RV32ZBA-NEXT: snez a2, a2
; RV32ZBA-NEXT: snez a0, a0
-; RV32ZBA-NEXT: add a4, a6, a4
+; RV32ZBA-NEXT: add a6, a6, a4
; RV32ZBA-NEXT: or a1, a1, a2
-; RV32ZBA-NEXT: sltu a2, a4, a6
+; RV32ZBA-NEXT: sltu a2, a6, a4
; RV32ZBA-NEXT: or a0, a1, a0
; RV32ZBA-NEXT: or a0, a0, a2
; RV32ZBA-NEXT: xori a0, a0, 1
@@ -3785,9 +3785,9 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
; RV32ZICOND-NEXT: czero.eqz a1, a3, a1
; RV32ZICOND-NEXT: snez a2, a2
; RV32ZICOND-NEXT: snez a0, a0
-; RV32ZICOND-NEXT: add a4, a5, a4
+; RV32ZICOND-NEXT: add a5, a5, a4
; RV32ZICOND-NEXT: or a1, a1, a2
-; RV32ZICOND-NEXT: sltu a2, a4, a5
+; RV32ZICOND-NEXT: sltu a2, a5, a4
; RV32ZICOND-NEXT: or a0, a1, a0
; RV32ZICOND-NEXT: or a0, a0, a2
; RV32ZICOND-NEXT: xori a0, a0, 1
@@ -4005,8 +4005,8 @@ continue:
define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
; RV32-LABEL: uaddo.br.i32:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: add a1, a0, a1
-; RV32-NEXT: bgeu a1, a0, .LBB54_2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: bgeu a0, a1, .LBB54_2
; RV32-NEXT: # %bb.1: # %overflow
; RV32-NEXT: li a0, 0
; RV32-NEXT: ret
@@ -4028,8 +4028,8 @@ define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
;
; RV32ZBA-LABEL: uaddo.br.i32:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: add a1, a0, a1
-; RV32ZBA-NEXT: bgeu a1, a0, .LBB54_2
+; RV32ZBA-NEXT: add a0, a0, a1
+; RV32ZBA-NEXT: bgeu a0, a1, .LBB54_2
; RV32ZBA-NEXT: # %bb.1: # %overflow
; RV32ZBA-NEXT: li a0, 0
; RV32ZBA-NEXT: ret
@@ -4051,8 +4051,8 @@ define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
;
; RV32ZICOND-LABEL: uaddo.br.i32:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: add a1, a0, a1
-; RV32ZICOND-NEXT: bgeu a1, a0, .LBB54_2
+; RV32ZICOND-NEXT: add a0, a0, a1
+; RV32ZICOND-NEXT: bgeu a0, a1, .LBB54_2
; RV32ZICOND-NEXT: # %bb.1: # %overflow
; RV32ZICOND-NEXT: li a0, 0
; RV32ZICOND-NEXT: ret
@@ -4105,8 +4105,8 @@ define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
;
; RV64-LABEL: uaddo.br.i64:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: add a1, a0, a1
-; RV64-NEXT: bgeu a1, a0, .LBB55_2
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: bgeu a0, a1, .LBB55_2
; RV64-NEXT: # %bb.1: # %overflow
; RV64-NEXT: li a0, 0
; RV64-NEXT: ret
@@ -4134,8 +4134,8 @@ define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
;
; RV64ZBA-LABEL: uaddo.br.i64:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: add a1, a0, a1
-; RV64ZBA-NEXT: bgeu a1, a0, .LBB55_2
+; RV64ZBA-NEXT: add a0, a0, a1
+; RV64ZBA-NEXT: bgeu a0, a1, .LBB55_2
; RV64ZBA-NEXT: # %bb.1: # %overflow
; RV64ZBA-NEXT: li a0, 0
; RV64ZBA-NEXT: ret
@@ -4164,8 +4164,8 @@ define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
;
; RV64ZICOND-LABEL: uaddo.br.i64:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: add a1, a0, a1
-; RV64ZICOND-NEXT: bgeu a1, a0, .LBB55_2
+; RV64ZICOND-NEXT: add a0, a0, a1
+; RV64ZICOND-NEXT: bgeu a0, a1, .LBB55_2
; RV64ZICOND-NEXT: # %bb.1: # %overflow
; RV64ZICOND-NEXT: li a0, 0
; RV64ZICOND-NEXT: ret
@@ -5077,9 +5077,9 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: and a1, a1, a3
; RV32-NEXT: snez a2, a2
; RV32-NEXT: snez a0, a0
-; RV32-NEXT: add a4, a6, a4
+; RV32-NEXT: add a6, a6, a4
; RV32-NEXT: or a1, a1, a2
-; RV32-NEXT: sltu a2, a4, a6
+; RV32-NEXT: sltu a2, a6, a4
; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: or a0, a0, a2
; RV32-NEXT: beqz a0, .LBB64_2
@@ -5114,9 +5114,9 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: and a1, a1, a3
; RV32ZBA-NEXT: snez a2, a2
; RV32ZBA-NEXT: snez a0, a0
-; RV32ZBA-NEXT: add a4, a6, a4
+; RV32ZBA-NEXT: add a6, a6, a4
; RV32ZBA-NEXT: or a1, a1, a2
-; RV32ZBA-NEXT: sltu a2, a4, a6
+; RV32ZBA-NEXT: sltu a2, a6, a4
; RV32ZBA-NEXT: or a0, a1, a0
; RV32ZBA-NEXT: or a0, a0, a2
; RV32ZBA-NEXT: beqz a0, .LBB64_2
@@ -5150,9 +5150,9 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
; RV32ZICOND-NEXT: czero.eqz a1, a3, a1
; RV32ZICOND-NEXT: snez a2, a2
; RV32ZICOND-NEXT: snez a0, a0
-; RV32ZICOND-NEXT: add a4, a5, a4
+; RV32ZICOND-NEXT: add a5, a5, a4
; RV32ZICOND-NEXT: or a1, a1, a2
-; RV32ZICOND-NEXT: sltu a2, a4, a5
+; RV32ZICOND-NEXT: sltu a2, a5, a4
; RV32ZICOND-NEXT: or a0, a1, a0
; RV32ZICOND-NEXT: or a0, a0, a2
; RV32ZICOND-NEXT: beqz a0, .LBB64_2
@@ -5302,7 +5302,7 @@ define zeroext i1 @uaddo.i64.constant(i64 %v1, ptr %res) {
; RV64-LABEL: uaddo.i64.constant:
; RV64: # %bb.0: # %entry
; RV64-NEXT: addi a2, a0, 2
-; RV64-NEXT: sltu a0, a2, a0
+; RV64-NEXT: sltiu a0, a2, 2
; RV64-NEXT: sd a2, 0(a1)
; RV64-NEXT: ret
;
@@ -5320,7 +5320,7 @@ define zeroext i1 @uaddo.i64.constant(i64 %v1, ptr %res) {
; RV64ZBA-LABEL: uaddo.i64.constant:
; RV64ZBA: # %bb.0: # %entry
; RV64ZBA-NEXT: addi a2, a0, 2
-; RV64ZBA-NEXT: sltu a0, a2, a0
+; RV64ZBA-NEXT: sltiu a0, a2, 2
; RV64ZBA-NEXT: sd a2, 0(a1)
; RV64ZBA-NEXT: ret
;
@@ -5338,7 +5338,7 @@ define zeroext i1 @uaddo.i64.constant(i64 %v1, ptr %res) {
; RV64ZICOND-LABEL: uaddo.i64.constant:
; RV64ZICOND: # %bb.0: # %entry
; RV64ZICOND-NEXT: addi a2, a0, 2
-; RV64ZICOND-NEXT: sltu a0, a2, a0
+; RV64ZICOND-NEXT: sltiu a0, a2, 2
; RV64ZICOND-NEXT: sd a2, 0(a1)
; RV64ZICOND-NEXT: ret
entry:
@@ -5364,9 +5364,10 @@ define zeroext i1 @uaddo.i64.constant_2048(i64 %v1, ptr %res) {
;
; RV64-LABEL: uaddo.i64.constant_2048:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: addi a2, a0, 2047
-; RV64-NEXT: addi a2, a2, 1
-; RV64-NEXT: sltu a0, a2, a0
+; RV64-NEXT: addi a0, a0, 2047
+; RV64-NEXT: addi a2, a0, 1
+; RV64-NEXT: srli a0, a2, 11
+; RV64-NEXT: seqz a0, a0
; RV64-NEXT: sd a2, 0(a1)
; RV64-NEXT: ret
;
@@ -5384,9 +5385,10 @@ define zeroext i1 @uaddo.i64.constant_2048(i64 %v1, ptr %res) {
;
; RV64ZBA-LABEL: uaddo.i64.constant_2048:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: addi a2, a0, 2047
-; RV64ZBA-NEXT: addi a2, a2, 1
-; RV64ZBA-NEXT: sltu a0, a2, a0
+; RV64ZBA-NEXT: addi a0, a0, 2047
+; RV64ZBA-NEXT: addi a2, a0, 1
+; RV64ZBA-NEXT: srli a0, a2, 11
+; RV64ZBA-NEXT: seqz a0, a0
; RV64ZBA-NEXT: sd a2, 0(a1)
; RV64ZBA-NEXT: ret
;
@@ -5404,9 +5406,10 @@ define zeroext i1 @uaddo.i64.constant_2048(i64 %v1, ptr %res) {
;
; RV64ZICOND-LABEL: uaddo.i64.constant_2048:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: addi a2, a0, 2047
-; RV64ZICOND-NEXT: addi a2, a2, 1
-; RV64ZICOND-NEXT: sltu a0, a2, a0
+; RV64ZICOND-NEXT: addi a0, a0, 2047
+; RV64ZICOND-NEXT: addi a2, a0, 1
+; RV64ZICOND-NEXT: srli a0, a2, 11
+; RV64ZICOND-NEXT: seqz a0, a0
; RV64ZICOND-NEXT: sd a2, 0(a1)
; RV64ZICOND-NEXT: ret
entry:
@@ -5432,10 +5435,11 @@ define zeroext i1 @uaddo.i64.constant_2049(i64 %v1, ptr %res) {
;
; RV64-LABEL: uaddo.i64.constant_2049:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: addi a2, a0, 2047
-; RV64-NEXT: addi a2, a2, 2
-; RV64-NEXT: sltu a0, a2, a0
-; RV64-NEXT: sd a2, 0(a1)
+; RV64-NEXT: lui a2, 1
+; RV64-NEXT: addi a2, a2, -2047
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: sltu a0, a3, a2
+; RV64-NEXT: sd a3, 0(a1)
; RV64-NEXT: ret
;
; RV32ZBA-LABEL: uaddo.i64.constant_2049:
@@ -5452,10 +5456,11 @@ define zeroext i1 @uaddo.i64.constant_2049(i64 %v1, ptr %res) {
;
; RV64ZBA-LABEL: uaddo.i64.constant_2049:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: addi a2, a0, 2047
-; RV64ZBA-NEXT: addi a2, a2, 2
-; RV64ZBA-NEXT: sltu a0, a2, a0
-; RV64ZBA-NEXT: sd a2, 0(a1)
+; RV64ZBA-NEXT: lui a2, 1
+; RV64ZBA-NEXT: addi a2, a2, -2047
+; RV64ZBA-NEXT: add a3, a0, a2
+; RV64ZBA-NEXT: sltu a0, a3, a2
+; RV64ZBA-NEXT: sd a3, 0(a1)
; RV64ZBA-NEXT: ret
;
; RV32ZICOND-LABEL: uaddo.i64.constant_2049:
@@ -5472,10 +5477,11 @@ define zeroext i1 @uaddo.i64.constant_2049(i64 %v1, ptr %res) {
;
; RV64ZICOND-LABEL: uaddo.i64.constant_2049:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: addi a2, a0, 2047
-; RV64ZICOND-NEXT: addi a2, a2, 2
-; RV64ZICOND-NEXT: sltu a0, a2, a0
-; RV64ZICOND-NEXT: sd a2, 0(a1)
+; RV64ZICOND-NEXT: lui a2, 1
+; RV64ZICOND-NEXT: addi a2, a2, -2047
+; RV64ZICOND-NEXT: add a3, a0, a2
+; RV64ZICOND-NEXT: sltu a0, a3, a2
+; RV64ZICOND-NEXT: sd a3, 0(a1)
; RV64ZICOND-NEXT: ret
entry:
%t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2049)
@@ -5504,8 +5510,9 @@ define i64 @uaddo.i64.constant_setcc_on_overflow_flag(ptr %p) {
;
; RV64-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: ld a1, 0(a0)
-; RV64-NEXT: addi a0, a1, 2
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: li a1, 2
; RV64-NEXT: bltu a0, a1, .LBB69_2
; RV64-NEXT: # %bb.1: # %IfOverflow
; RV64-NEXT: li a0, 0
@@ -5530,8 +5537,9 @@ define i64 @uaddo.i64.constant_setcc_on_overflow_flag(ptr %p) {
;
; RV64ZBA-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: ld a1, 0(a0)
-; RV64ZBA-NEXT: addi a0, a1, 2
+; RV64ZBA-NEXT: ld a0, 0(a0)
+; RV64ZBA-NEXT: addi a0, a0, 2
+; RV64ZBA-NEXT: li a1, 2
; RV64ZBA-NEXT: bltu a0, a1, .LBB69_2
; RV64ZBA-NEXT: # %bb.1: # %IfOverflow
; RV64ZBA-NEXT: li a0, 0
@@ -5556,8 +5564,9 @@ define i64 @uaddo.i64.constant_setcc_on_overflow_flag(ptr %p) {
;
; RV64ZICOND-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: ld a1, 0(a0)
-; RV64ZICOND-NEXT: addi a0, a1, 2
+; RV64ZICOND-NEXT: ld a0, 0(a0)
+; RV64ZICOND-NEXT: addi a0, a0, 2
+; RV64ZICOND-NEXT: li a1, 2
; RV64ZICOND-NEXT: bltu a0, a1, .LBB69_2
; RV64ZICOND-NEXT: # %bb.1: # %IfOverflow
; RV64ZICOND-NEXT: li a0, 0
diff --git a/llvm/test/CodeGen/RISCV/xqcia.ll b/llvm/test/CodeGen/RISCV/xqcia.ll
index 6d5fc765c49a8..8d1991f01f0dd 100644
--- a/llvm/test/CodeGen/RISCV/xqcia.ll
+++ b/llvm/test/CodeGen/RISCV/xqcia.ll
@@ -31,10 +31,10 @@ define i32 @addsat(i32 %a, i32 %b) {
define i32 @addusat(i32 %a, i32 %b) {
; RV32I-LABEL: addusat:
; RV32I: # %bb.0:
-; RV32I-NEXT: add a1, a0, a1
-; RV32I-NEXT: sltu a0, a1, a0
-; RV32I-NEXT: neg a0, a0
-; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: sltu a1, a0, a1
+; RV32I-NEXT: neg a1, a1
+; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: ret
;
; RV32IXQCIA-LABEL: addusat:
diff --git a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
index 40e90d6bdd6af..12818f3a7b78b 100644
--- a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
@@ -10,42 +10,42 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
; SPARC-NEXT: mov %i3, %g2
; SPARC-NEXT: mov %i2, %g4
; SPARC-NEXT: umul %i2, %i5, %i2
-; SPARC-NEXT: rd %y, %l7
-; SPARC-NEXT: ld [%fp+92], %l4
+; SPARC-NEXT: rd %y, %l5
+; SPARC-NEXT: ld [%fp+92], %l1
; SPARC-NEXT: umul %i4, %i3, %i3
-; SPARC-NEXT: rd %y, %o1
+; SPARC-NEXT: rd %y, %l6
; SPARC-NEXT: ld [%fp+96], %g3
; SPARC-NEXT: umul %i5, %g2, %l3
-; SPARC-NEXT: rd %y, %o0
-; SPARC-NEXT: umul %l4, %i1, %l2
-; SPARC-NEXT: rd %y, %l1
-; SPARC-NEXT: add %i3, %i2, %i2
-; SPARC-NEXT: umul %i0, %g3, %i3
-; SPARC-NEXT: rd %y, %l6
-; SPARC-NEXT: add %o0, %i2, %o2
-; SPARC-NEXT: umul %i1, %g3, %i2
+; SPARC-NEXT: rd %y, %l4
+; SPARC-NEXT: umul %l1, %i1, %l7
; SPARC-NEXT: rd %y, %l0
-; SPARC-NEXT: add %i3, %l2, %i3
-; SPARC-NEXT: add %l0, %i3, %l2
-; SPARC-NEXT: addcc %i2, %l3, %l3
+; SPARC-NEXT: add %i3, %i2, %o1
+; SPARC-NEXT: umul %i0, %g3, %i2
+; SPARC-NEXT: rd %y, %l2
+; SPARC-NEXT: add %l4, %o1, %o2
+; SPARC-NEXT: umul %i1, %g3, %i3
+; SPARC-NEXT: rd %y, %l4
+; SPARC-NEXT: add %i2, %l7, %l7
+; SPARC-NEXT: add %l4, %l7, %o0
+; SPARC-NEXT: addcc %i3, %l3, %l3
; SPARC-NEXT: umul %g2, %g3, %i3
; SPARC-NEXT: rd %y, %i2
-; SPARC-NEXT: addxcc %l2, %o2, %o4
+; SPARC-NEXT: addxcc %o0, %o2, %o4
; SPARC-NEXT: umul %g4, %g3, %g3
-; SPARC-NEXT: rd %y, %l5
+; SPARC-NEXT: rd %y, %l4
; SPARC-NEXT: addcc %g3, %i2, %i2
-; SPARC-NEXT: addxcc %l5, 0, %g3
-; SPARC-NEXT: umul %g2, %l4, %g2
-; SPARC-NEXT: rd %y, %l5
+; SPARC-NEXT: addxcc %l4, 0, %g3
+; SPARC-NEXT: umul %g2, %l1, %g2
+; SPARC-NEXT: rd %y, %l4
; SPARC-NEXT: addcc %g2, %i2, %i2
-; SPARC-NEXT: addxcc %l5, 0, %g2
+; SPARC-NEXT: addxcc %l4, 0, %g2
; SPARC-NEXT: addcc %g3, %g2, %g2
; SPARC-NEXT: addxcc %g0, 0, %g3
-; SPARC-NEXT: umul %g4, %l4, %l5
+; SPARC-NEXT: umul %g4, %l1, %l4
; SPARC-NEXT: rd %y, %o3
-; SPARC-NEXT: addcc %l5, %g2, %l5
+; SPARC-NEXT: addcc %l4, %g2, %l4
; SPARC-NEXT: addxcc %o3, %g3, %o3
-; SPARC-NEXT: addcc %l5, %l3, %g2
+; SPARC-NEXT: addcc %l4, %l3, %g2
; SPARC-NEXT: addxcc %o3, %o4, %g3
; SPARC-NEXT: mov 1, %l3
; SPARC-NEXT: cmp %g3, %o3
@@ -54,101 +54,101 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
; SPARC-NEXT: ! %bb.1: ! %start
; SPARC-NEXT: mov %g0, %o4
; SPARC-NEXT: .LBB0_2: ! %start
-; SPARC-NEXT: cmp %g2, %l5
+; SPARC-NEXT: cmp %g2, %l4
; SPARC-NEXT: bcs .LBB0_4
-; SPARC-NEXT: mov %l3, %l5
+; SPARC-NEXT: mov %l3, %l4
; SPARC-NEXT: ! %bb.3: ! %start
-; SPARC-NEXT: mov %g0, %l5
+; SPARC-NEXT: mov %g0, %l4
; SPARC-NEXT: .LBB0_4: ! %start
; SPARC-NEXT: cmp %g3, %o3
; SPARC-NEXT: be .LBB0_6
; SPARC-NEXT: nop
; SPARC-NEXT: ! %bb.5: ! %start
-; SPARC-NEXT: mov %o4, %l5
+; SPARC-NEXT: mov %o4, %l4
; SPARC-NEXT: .LBB0_6: ! %start
-; SPARC-NEXT: cmp %g4, 0
-; SPARC-NEXT: bne .LBB0_8
-; SPARC-NEXT: mov %l3, %o3
+; SPARC-NEXT: cmp %o2, %o1
+; SPARC-NEXT: bcs .LBB0_8
+; SPARC-NEXT: mov %l3, %o1
; SPARC-NEXT: ! %bb.7: ! %start
-; SPARC-NEXT: mov %g0, %o3
+; SPARC-NEXT: mov %g0, %o1
; SPARC-NEXT: .LBB0_8: ! %start
-; SPARC-NEXT: cmp %i4, 0
+; SPARC-NEXT: cmp %g4, 0
; SPARC-NEXT: bne .LBB0_10
-; SPARC-NEXT: mov %l3, %o4
+; SPARC-NEXT: mov %l3, %o2
; SPARC-NEXT: ! %bb.9: ! %start
-; SPARC-NEXT: mov %g0, %o4
+; SPARC-NEXT: mov %g0, %o2
; SPARC-NEXT: .LBB0_10: ! %start
-; SPARC-NEXT: cmp %o1, 0
+; SPARC-NEXT: cmp %i4, 0
; SPARC-NEXT: bne .LBB0_12
-; SPARC-NEXT: mov %l3, %o1
+; SPARC-NEXT: mov %l3, %o3
; SPARC-NEXT: ! %bb.11: ! %start
-; SPARC-NEXT: mov %g0, %o1
+; SPARC-NEXT: mov %g0, %o3
; SPARC-NEXT: .LBB0_12: ! %start
-; SPARC-NEXT: cmp %l7, 0
+; SPARC-NEXT: cmp %l6, 0
; SPARC-NEXT: bne .LBB0_14
-; SPARC-NEXT: mov %l3, %l7
+; SPARC-NEXT: mov %l3, %l6
; SPARC-NEXT: ! %bb.13: ! %start
-; SPARC-NEXT: mov %g0, %l7
+; SPARC-NEXT: mov %g0, %l6
; SPARC-NEXT: .LBB0_14: ! %start
-; SPARC-NEXT: cmp %o2, %o0
-; SPARC-NEXT: bcs .LBB0_16
-; SPARC-NEXT: mov %l3, %g4
+; SPARC-NEXT: cmp %l5, 0
+; SPARC-NEXT: bne .LBB0_16
+; SPARC-NEXT: mov %l3, %l5
; SPARC-NEXT: ! %bb.15: ! %start
-; SPARC-NEXT: mov %g0, %g4
+; SPARC-NEXT: mov %g0, %l5
; SPARC-NEXT: .LBB0_16: ! %start
-; SPARC-NEXT: cmp %l4, 0
-; SPARC-NEXT: bne .LBB0_18
-; SPARC-NEXT: mov %l3, %l4
+; SPARC-NEXT: cmp %o0, %l7
+; SPARC-NEXT: bcs .LBB0_18
+; SPARC-NEXT: mov %l3, %g4
; SPARC-NEXT: ! %bb.17: ! %start
-; SPARC-NEXT: mov %g0, %l4
+; SPARC-NEXT: mov %g0, %g4
; SPARC-NEXT: .LBB0_18: ! %start
-; SPARC-NEXT: cmp %i0, 0
+; SPARC-NEXT: cmp %l1, 0
; SPARC-NEXT: bne .LBB0_20
-; SPARC-NEXT: mov %l3, %o0
+; SPARC-NEXT: mov %l3, %l1
; SPARC-NEXT: ! %bb.19: ! %start
-; SPARC-NEXT: mov %g0, %o0
+; SPARC-NEXT: mov %g0, %l1
; SPARC-NEXT: .LBB0_20: ! %start
-; SPARC-NEXT: cmp %l6, 0
+; SPARC-NEXT: cmp %i0, 0
; SPARC-NEXT: bne .LBB0_22
-; SPARC-NEXT: mov %l3, %l6
+; SPARC-NEXT: mov %l3, %o0
; SPARC-NEXT: ! %bb.21: ! %start
-; SPARC-NEXT: mov %g0, %l6
+; SPARC-NEXT: mov %g0, %o0
; SPARC-NEXT: .LBB0_22: ! %start
-; SPARC-NEXT: and %o4, %o3, %o2
-; SPARC-NEXT: cmp %l1, 0
-; SPARC-NEXT: and %o0, %l4, %o0
+; SPARC-NEXT: and %o3, %o2, %l7
+; SPARC-NEXT: cmp %l2, 0
+; SPARC-NEXT: and %o0, %l1, %l2
; SPARC-NEXT: bne .LBB0_24
-; SPARC-NEXT: mov %l3, %l1
+; SPARC-NEXT: mov %l3, %o0
; SPARC-NEXT: ! %bb.23: ! %start
-; SPARC-NEXT: mov %g0, %l1
+; SPARC-NEXT: mov %g0, %o0
; SPARC-NEXT: .LBB0_24: ! %start
-; SPARC-NEXT: or %o2, %o1, %l4
-; SPARC-NEXT: cmp %l2, %l0
-; SPARC-NEXT: or %o0, %l6, %l6
-; SPARC-NEXT: bcs .LBB0_26
-; SPARC-NEXT: mov %l3, %l0
+; SPARC-NEXT: or %l7, %l6, %l1
+; SPARC-NEXT: cmp %l0, 0
+; SPARC-NEXT: or %l2, %o0, %l2
+; SPARC-NEXT: bne .LBB0_26
+; SPARC-NEXT: mov %l3, %l6
; SPARC-NEXT: ! %bb.25: ! %start
-; SPARC-NEXT: mov %g0, %l0
+; SPARC-NEXT: mov %g0, %l6
; SPARC-NEXT: .LBB0_26: ! %start
-; SPARC-NEXT: or %l4, %l7, %l2
+; SPARC-NEXT: or %l1, %l5, %l0
; SPARC-NEXT: orcc %i5, %i4, %g0
-; SPARC-NEXT: or %l6, %l1, %l1
+; SPARC-NEXT: or %l2, %l6, %l1
; SPARC-NEXT: bne .LBB0_28
; SPARC-NEXT: mov %l3, %i4
; SPARC-NEXT: ! %bb.27: ! %start
; SPARC-NEXT: mov %g0, %i4
; SPARC-NEXT: .LBB0_28: ! %start
-; SPARC-NEXT: or %l2, %g4, %i5
+; SPARC-NEXT: or %l0, %o1, %i5
; SPARC-NEXT: orcc %i1, %i0, %g0
; SPARC-NEXT: bne .LBB0_30
-; SPARC-NEXT: or %l1, %l0, %i0
+; SPARC-NEXT: or %l1, %g4, %i0
; SPARC-NEXT: ! %bb.29: ! %start
; SPARC-NEXT: mov %g0, %l3
; SPARC-NEXT: .LBB0_30: ! %start
; SPARC-NEXT: and %l3, %i4, %i1
; SPARC-NEXT: or %i1, %i0, %i0
; SPARC-NEXT: or %i0, %i5, %i0
-; SPARC-NEXT: or %i0, %l5, %i0
+; SPARC-NEXT: or %i0, %l4, %i0
; SPARC-NEXT: and %i0, 1, %i4
; SPARC-NEXT: mov %g3, %i0
; SPARC-NEXT: ret
@@ -173,7 +173,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
; SPARC64-NEXT: call __multi3
; SPARC64-NEXT: mov %i3, %o3
; SPARC64-NEXT: mov %o0, %l0
-; SPARC64-NEXT: add %o1, %i5, %i0
+; SPARC64-NEXT: add %o1, %i5, %i5
; SPARC64-NEXT: mov %g0, %o0
; SPARC64-NEXT: mov %i1, %o1
; SPARC64-NEXT: mov %g0, %o2
@@ -181,19 +181,19 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
; SPARC64-NEXT: mov %i3, %o3
; SPARC64-NEXT: mov %g0, %i1
; SPARC64-NEXT: mov %g0, %i3
-; SPARC64-NEXT: mov %g0, %i5
; SPARC64-NEXT: mov %g0, %g2
; SPARC64-NEXT: mov %g0, %g3
-; SPARC64-NEXT: add %o0, %i0, %i0
-; SPARC64-NEXT: cmp %i0, %o0
+; SPARC64-NEXT: mov %g0, %g4
+; SPARC64-NEXT: add %o0, %i5, %i0
+; SPARC64-NEXT: cmp %i0, %i5
; SPARC64-NEXT: movrnz %l0, 1, %i3
-; SPARC64-NEXT: movrnz %i2, 1, %i5
-; SPARC64-NEXT: movrnz %l1, 1, %g2
+; SPARC64-NEXT: movrnz %i2, 1, %g2
+; SPARC64-NEXT: movrnz %l1, 1, %g3
; SPARC64-NEXT: movcs %xcc, 1, %i1
-; SPARC64-NEXT: and %g2, %i5, %i2
+; SPARC64-NEXT: and %g3, %g2, %i2
; SPARC64-NEXT: or %i2, %i3, %i2
-; SPARC64-NEXT: movrnz %i4, 1, %g3
-; SPARC64-NEXT: or %i2, %g3, %i2
+; SPARC64-NEXT: movrnz %i4, 1, %g4
+; SPARC64-NEXT: or %i2, %g4, %i2
; SPARC64-NEXT: or %i2, %i1, %i1
; SPARC64-NEXT: srl %i1, 0, %i2
; SPARC64-NEXT: ret
@@ -211,9 +211,9 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
; SPARC64-VIS3-NEXT: mov %g0, %g5
; SPARC64-VIS3-NEXT: mulx %i2, %i1, %i4
; SPARC64-VIS3-NEXT: mulx %i0, %i3, %l0
-; SPARC64-VIS3-NEXT: add %l0, %i4, %i4
-; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %l0
-; SPARC64-VIS3-NEXT: add %l0, %i4, %i4
+; SPARC64-VIS3-NEXT: add %l0, %i4, %l0
+; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %i4
+; SPARC64-VIS3-NEXT: add %i4, %l0, %i4
; SPARC64-VIS3-NEXT: cmp %i4, %l0
; SPARC64-VIS3-NEXT: movrnz %i2, 1, %g2
; SPARC64-VIS3-NEXT: movrnz %i0, 1, %g3
diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
index 1e537fe64c08d..eb1eb301dfd9c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
@@ -118,28 +118,28 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @uadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
; CHECK-LABEL: uadd_int64_t:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: adds r5, r2, r0
-; CHECK-NEXT: adc.w lr, r3, r1
-; CHECK-NEXT: subs r2, r5, r2
-; CHECK-NEXT: sbcs.w r2, lr, r3
-; CHECK-NEXT: vmov r3, r12, d2
-; CHECK-NEXT: vmov r1, r4, d0
-; CHECK-NEXT: csetm r2, lo
-; CHECK-NEXT: adds r3, r3, r1
-; CHECK-NEXT: adc.w r0, r4, r12
-; CHECK-NEXT: subs r1, r3, r1
-; CHECK-NEXT: sbcs.w r1, r0, r4
-; CHECK-NEXT: vmov q1[2], q1[0], r3, r5
+; CHECK-NEXT: adds.w lr, r2, r0
+; CHECK-NEXT: vmov r2, r4, d0
+; CHECK-NEXT: adc.w r12, r3, r1
+; CHECK-NEXT: subs.w r0, lr, r0
+; CHECK-NEXT: sbcs.w r0, r12, r1
+; CHECK-NEXT: vmov r1, r3, d2
+; CHECK-NEXT: csetm r0, lo
+; CHECK-NEXT: adds r2, r2, r1
+; CHECK-NEXT: adcs r4, r3
+; CHECK-NEXT: subs r1, r2, r1
+; CHECK-NEXT: sbcs.w r1, r4, r3
+; CHECK-NEXT: vmov q1[2], q1[0], r2, lr
; CHECK-NEXT: csetm r1, lo
-; CHECK-NEXT: vmov q1[3], q1[1], r0, lr
-; CHECK-NEXT: vmov q0[2], q0[0], r1, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r2
+; CHECK-NEXT: vmov q1[3], q1[1], r4, r12
+; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
; CHECK-NEXT: vorr q0, q1, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, pc}
entry:
%0 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2)
ret <2 x i64> %0
diff --git a/llvm/test/CodeGen/WebAssembly/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/WebAssembly/umulo-128-legalisation-lowering.ll
index 110fb2d43580a..373f4f2077b15 100644
--- a/llvm/test/CodeGen/WebAssembly/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/WebAssembly/umulo-128-legalisation-lowering.ll
@@ -1,78 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=wasm32 -wasm-keep-registers | FileCheck %s --check-prefixes=WASM32
; NOTE: did not compile on wasm64 at the time the test was created!
define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
-; WASM32-LABEL: muloti_test
-; WASM32: global.get $push16=, __stack_pointer
-; WASM32: i32.const $push17=, 48
-; WASM32: i32.sub $push38=, $pop16, $pop17
-; WASM32: local.tee $push37=, 5, $pop38
-; WASM32: global.set __stack_pointer, $pop37
-; WASM32: local.get $push39=, 5
-; WASM32: i32.const $push22=, 32
-; WASM32: i32.add $push23=, $pop39, $pop22
-; WASM32: local.get $push41=, 1
-; WASM32: i64.const $push0=, 0
-; WASM32: local.get $push40=, 3
-; WASM32: i64.const $push36=, 0
-; WASM32: call __multi3, $pop23, $pop41, $pop0, $pop40, $pop36
-; WASM32: local.get $push42=, 5
-; WASM32: i32.const $push20=, 16
-; WASM32: i32.add $push21=, $pop42, $pop20
-; WASM32: local.get $push44=, 4
-; WASM32: i64.const $push35=, 0
-; WASM32: local.get $push43=, 1
-; WASM32: i64.const $push34=, 0
-; WASM32: call __multi3, $pop21, $pop44, $pop35, $pop43, $pop34
-; WASM32: local.get $push47=, 5
-; WASM32: local.get $push46=, 2
-; WASM32: i64.const $push33=, 0
-; WASM32: local.get $push45=, 3
-; WASM32: i64.const $push32=, 0
-; WASM32: call __multi3, $pop47, $pop46, $pop33, $pop45, $pop32
-; WASM32: local.get $push49=, 0
-; WASM32: local.get $push48=, 5
-; WASM32: i64.load $push1=, 32($pop48)
-; WASM32: i64.store 0($pop49), $pop1
-; WASM32: local.get $push53=, 0
-; WASM32: local.get $push50=, 5
-; WASM32: i64.load $push31=, 40($pop50)
-; WASM32: local.tee $push30=, 3, $pop31
-; WASM32: local.get $push51=, 5
-; WASM32: i64.load $push3=, 0($pop51)
-; WASM32: local.get $push52=, 5
-; WASM32: i64.load $push2=, 16($pop52)
-; WASM32: i64.add $push4=, $pop3, $pop2
-; WASM32: i64.add $push29=, $pop30, $pop4
-; WASM32: local.tee $push28=, 1, $pop29
-; WASM32: i64.store 8($pop53), $pop28
-; WASM32: local.get $push60=, 0
-; WASM32: local.get $push54=, 2
-; WASM32: i64.const $push27=, 0
-; WASM32: i64.ne $push6=, $pop54, $pop27
-; WASM32: local.get $push55=, 4
-; WASM32: i64.const $push26=, 0
-; WASM32: i64.ne $push5=, $pop55, $pop26
-; WASM32: i32.and $push7=, $pop6, $pop5
-; WASM32: local.get $push56=, 5
-; WASM32: i64.load $push8=, 8($pop56)
-; WASM32: i64.const $push25=, 0
-; WASM32: i64.ne $push9=, $pop8, $pop25
-; WASM32: i32.or $push10=, $pop7, $pop9
-; WASM32: local.get $push57=, 5
-; WASM32: i64.load $push11=, 24($pop57)
-; WASM32: i64.const $push24=, 0
-; WASM32: i64.ne $push12=, $pop11, $pop24
-; WASM32: i32.or $push13=, $pop10, $pop12
-; WASM32: local.get $push59=, 1
-; WASM32: local.get $push58=, 3
-; WASM32: i64.lt_u $push14=, $pop59, $pop58
-; WASM32: i32.or $push15=, $pop13, $pop14
-; WASM32: i32.store8 16($pop60), $pop15
-; WASM32: local.get $push61=, 5
-; WASM32: i32.const $push18=, 48
-; WASM32: i32.add $push19=, $pop61, $pop18
-; WASM32: global.set __stack_pointer, $pop19
+; WASM32-LABEL: muloti_test:
+; WASM32: .functype muloti_test (i32, i64, i64, i64, i64) -> ()
+; WASM32-NEXT: .local i32
+; WASM32-NEXT: # %bb.0: # %start
+; WASM32-NEXT: global.get $push16=, __stack_pointer
+; WASM32-NEXT: i32.const $push17=, 48
+; WASM32-NEXT: i32.sub $push38=, $pop16, $pop17
+; WASM32-NEXT: local.tee $push37=, 5, $pop38
+; WASM32-NEXT: global.set __stack_pointer, $pop37
+; WASM32-NEXT: local.get $push39=, 5
+; WASM32-NEXT: i32.const $push22=, 32
+; WASM32-NEXT: i32.add $push23=, $pop39, $pop22
+; WASM32-NEXT: local.get $push41=, 1
+; WASM32-NEXT: i64.const $push0=, 0
+; WASM32-NEXT: local.get $push40=, 3
+; WASM32-NEXT: i64.const $push36=, 0
+; WASM32-NEXT: call __multi3, $pop23, $pop41, $pop0, $pop40, $pop36
+; WASM32-NEXT: local.get $push42=, 5
+; WASM32-NEXT: i32.const $push20=, 16
+; WASM32-NEXT: i32.add $push21=, $pop42, $pop20
+; WASM32-NEXT: local.get $push44=, 4
+; WASM32-NEXT: i64.const $push35=, 0
+; WASM32-NEXT: local.get $push43=, 1
+; WASM32-NEXT: i64.const $push34=, 0
+; WASM32-NEXT: call __multi3, $pop21, $pop44, $pop35, $pop43, $pop34
+; WASM32-NEXT: local.get $push47=, 5
+; WASM32-NEXT: local.get $push46=, 2
+; WASM32-NEXT: i64.const $push33=, 0
+; WASM32-NEXT: local.get $push45=, 3
+; WASM32-NEXT: i64.const $push32=, 0
+; WASM32-NEXT: call __multi3, $pop47, $pop46, $pop33, $pop45, $pop32
+; WASM32-NEXT: local.get $push49=, 0
+; WASM32-NEXT: local.get $push48=, 5
+; WASM32-NEXT: i64.load $push1=, 32($pop48)
+; WASM32-NEXT: i64.store 0($pop49), $pop1
+; WASM32-NEXT: local.get $push53=, 0
+; WASM32-NEXT: local.get $push50=, 5
+; WASM32-NEXT: i64.load $push4=, 40($pop50)
+; WASM32-NEXT: local.get $push51=, 5
+; WASM32-NEXT: i64.load $push3=, 0($pop51)
+; WASM32-NEXT: local.get $push52=, 5
+; WASM32-NEXT: i64.load $push2=, 16($pop52)
+; WASM32-NEXT: i64.add $push31=, $pop3, $pop2
+; WASM32-NEXT: local.tee $push30=, 3, $pop31
+; WASM32-NEXT: i64.add $push29=, $pop4, $pop30
+; WASM32-NEXT: local.tee $push28=, 1, $pop29
+; WASM32-NEXT: i64.store 8($pop53), $pop28
+; WASM32-NEXT: local.get $push60=, 0
+; WASM32-NEXT: local.get $push54=, 2
+; WASM32-NEXT: i64.const $push27=, 0
+; WASM32-NEXT: i64.ne $push7=, $pop54, $pop27
+; WASM32-NEXT: local.get $push55=, 4
+; WASM32-NEXT: i64.const $push26=, 0
+; WASM32-NEXT: i64.ne $push6=, $pop55, $pop26
+; WASM32-NEXT: i32.and $push8=, $pop7, $pop6
+; WASM32-NEXT: local.get $push56=, 5
+; WASM32-NEXT: i64.load $push9=, 8($pop56)
+; WASM32-NEXT: i64.const $push25=, 0
+; WASM32-NEXT: i64.ne $push10=, $pop9, $pop25
+; WASM32-NEXT: i32.or $push11=, $pop8, $pop10
+; WASM32-NEXT: local.get $push57=, 5
+; WASM32-NEXT: i64.load $push12=, 24($pop57)
+; WASM32-NEXT: i64.const $push24=, 0
+; WASM32-NEXT: i64.ne $push13=, $pop12, $pop24
+; WASM32-NEXT: i32.or $push14=, $pop11, $pop13
+; WASM32-NEXT: local.get $push59=, 1
+; WASM32-NEXT: local.get $push58=, 3
+; WASM32-NEXT: i64.lt_u $push5=, $pop59, $pop58
+; WASM32-NEXT: i32.or $push15=, $pop14, $pop5
+; WASM32-NEXT: i32.store8 16($pop60), $pop15
+; WASM32-NEXT: local.get $push61=, 5
+; WASM32-NEXT: i32.const $push18=, 48
+; WASM32-NEXT: i32.add $push19=, $pop61, $pop18
+; WASM32-NEXT: global.set __stack_pointer, $pop19
+; WASM32-NEXT: # fallthrough-return
start:
%0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
index dbfa69d497698..683442c88b43b 100644
--- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
@@ -1782,12 +1782,11 @@ define <4 x i32> @vp_uadd_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i
; SSE-LABEL: vp_uadd_sat_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pxor %xmm2, %xmm3
; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: vp_uadd_sat_v4i32:
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index b12be7cb129d3..bb6104539108a 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -711,21 +711,19 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) {
;
; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT: movdqa %xmm0, %xmm2
-; SSE42-NEXT: pxor %xmm1, %xmm2
; SSE42-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: pxor %xmm0, %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775850,9223372036854775850]
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
; SSE42-NEXT: por %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775850,9223372036854775850]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -770,21 +768,19 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) {
;
; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT: movdqa %xmm0, %xmm2
-; SSE42-NEXT: pxor %xmm1, %xmm2
; SSE42-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: pxor %xmm0, %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775850,9223372036854775850]
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
; SSE42-NEXT: por %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775850,9223372036854775850]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -1251,18 +1247,17 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i
; SSE42-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: pxor %xmm2, %xmm3
; SSE42-NEXT: paddq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm1
; SSE42-NEXT: pxor %xmm0, %xmm2
-; SSE42-NEXT: pcmpgtq %xmm2, %xmm3
-; SSE42-NEXT: por %xmm3, %xmm0
+; SSE42-NEXT: pcmpgtq %xmm2, %xmm1
+; SSE42-NEXT: por %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
index d744ce6ed6af0..20941042ea7a7 100644
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -540,23 +540,21 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: paddd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
@@ -608,23 +606,21 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: paddd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v4i32:
@@ -676,37 +672,33 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
; SSSE3-NEXT: paddd %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: por %xmm5, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: paddd %xmm3, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm3
; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
-; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v8i32:
@@ -767,65 +759,57 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; SSE2-LABEL: v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
-; SSE2-NEXT: por %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: paddd %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: paddd %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm2
; SSE2-NEXT: paddd %xmm7, %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm7
; SSE2-NEXT: pxor %xmm3, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
+; SSE2-NEXT: por %xmm7, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v16i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm9
-; SSSE3-NEXT: pxor %xmm8, %xmm9
; SSSE3-NEXT: paddd %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9
-; SSSE3-NEXT: por %xmm9, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm9
+; SSSE3-NEXT: pxor %xmm8, %xmm9
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4
+; SSSE3-NEXT: por %xmm4, %xmm0
; SSSE3-NEXT: paddd %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: pxor %xmm8, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm5, %xmm1
; SSSE3-NEXT: paddd %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm8, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT: por %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm8, %xmm6
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: paddd %xmm7, %xmm3
+; SSSE3-NEXT: pxor %xmm8, %xmm7
; SSSE3-NEXT: pxor %xmm3, %xmm8
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm4, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7
+; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v16i32:
@@ -897,26 +881,25 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE-LABEL: v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pxor %xmm2, %xmm3
; SSE-NEXT: paddq %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
@@ -926,7 +909,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; AVX2-LABEL: v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
@@ -959,47 +942,45 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE-LABEL: v4i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: pxor %xmm4, %xmm5
; SSE-NEXT: paddq %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pxor %xmm4, %xmm5
+; SSE-NEXT: movdqa %xmm2, %xmm6
+; SSE-NEXT: pcmpgtd %xmm5, %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pcmpeqd %xmm2, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
; SSE-NEXT: pand %xmm7, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE-NEXT: por %xmm5, %xmm0
; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm2
; SSE-NEXT: paddq %xmm3, %xmm1
+; SSE-NEXT: pxor %xmm4, %xmm3
; SSE-NEXT: pxor %xmm1, %xmm4
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE-NEXT: pand %xmm5, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE-NEXT: pand %xmm5, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE-NEXT: por %xmm2, %xmm1
+; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
@@ -1010,7 +991,7 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; AVX2-LABEL: v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm1
@@ -1042,88 +1023,84 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE-LABEL: v8i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT: movdqa %xmm0, %xmm9
-; SSE-NEXT: pxor %xmm8, %xmm9
; SSE-NEXT: paddq %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: pxor %xmm8, %xmm4
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: pcmpgtd %xmm4, %xmm10
+; SSE-NEXT: movdqa %xmm0, %xmm9
+; SSE-NEXT: pxor %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm4, %xmm10
+; SSE-NEXT: pcmpgtd %xmm9, %xmm10
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pcmpeqd %xmm4, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3]
; SSE-NEXT: pand %xmm11, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
; SSE-NEXT: por %xmm9, %xmm0
; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pxor %xmm8, %xmm4
; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: pxor %xmm8, %xmm5
-; SSE-NEXT: movdqa %xmm4, %xmm9
-; SSE-NEXT: pcmpgtd %xmm5, %xmm9
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pxor %xmm8, %xmm4
+; SSE-NEXT: movdqa %xmm5, %xmm9
+; SSE-NEXT: pcmpgtd %xmm4, %xmm9
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE-NEXT: pand %xmm10, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: por %xmm4, %xmm1
+; SSE-NEXT: paddq %xmm6, %xmm2
+; SSE-NEXT: pxor %xmm8, %xmm6
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: pxor %xmm8, %xmm4
-; SSE-NEXT: paddq %xmm6, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: pxor %xmm8, %xmm5
-; SSE-NEXT: movdqa %xmm4, %xmm6
-; SSE-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE-NEXT: movdqa %xmm6, %xmm5
+; SSE-NEXT: pcmpgtd %xmm4, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE-NEXT: pand %xmm9, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE-NEXT: por %xmm5, %xmm2
; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: pxor %xmm8, %xmm4
; SSE-NEXT: paddq %xmm7, %xmm3
+; SSE-NEXT: pxor %xmm8, %xmm7
; SSE-NEXT: pxor %xmm3, %xmm8
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: pcmpgtd %xmm8, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm4, %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
-; SSE-NEXT: pand %xmm6, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm5, %xmm3
+; SSE-NEXT: movdqa %xmm7, %xmm4
+; SSE-NEXT: pcmpgtd %xmm8, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm7, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
+; SSE-NEXT: pand %xmm5, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE-NEXT: por %xmm4, %xmm3
+; SSE-NEXT: por %xmm6, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: v8i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT: vpaddq %xmm7, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT: vpaddq %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm6
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm6
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpaddq %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm4
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
@@ -1134,12 +1111,12 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX2-LABEL: v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm5
+; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm2
+; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm2
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm3
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index be7888cd76a6b..4febf7d7128ce 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -43,50 +43,53 @@ define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: uaddo_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: movq %xmm1, (%rdi)
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: uaddo_v2i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: paddd %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: movq %xmm1, (%rdi)
+; SSSE3-NEXT: paddd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT: movq %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: uaddo_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: pmaxud %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: pmaxud %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movq %xmm1, (%rdi)
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: uaddo_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm1, (%rdi)
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpmaxud %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovq %xmm2, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: uaddo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpltud %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpcmpltud %xmm1, %xmm2, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: vmovq %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
@@ -100,57 +103,60 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: uaddo_v3i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: movq %xmm1, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT: movd %xmm1, 8(%rdi)
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT: movd %xmm0, 8(%rdi)
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: uaddo_v3i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: paddd %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: movq %xmm1, (%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSSE3-NEXT: movd %xmm1, 8(%rdi)
+; SSSE3-NEXT: paddd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT: movq %xmm0, (%rdi)
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSSE3-NEXT: movd %xmm0, 8(%rdi)
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: uaddo_v3i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: pmaxud %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: pmaxud %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: pextrd $2, %xmm1, 8(%rdi)
-; SSE41-NEXT: movq %xmm1, (%rdi)
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi)
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: uaddo_v3i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi)
-; AVX-NEXT: vmovq %xmm1, (%rdi)
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpmaxud %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpextrd $2, %xmm2, 8(%rdi)
+; AVX-NEXT: vmovq %xmm2, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: uaddo_v3i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpltud %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpcmpltud %xmm1, %xmm2, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi)
-; AVX512-NEXT: vmovq %xmm1, (%rdi)
+; AVX512-NEXT: vpextrd $2, %xmm2, 8(%rdi)
+; AVX512-NEXT: vmovq %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
%val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
@@ -164,50 +170,53 @@ define <4 x i32> @uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: uaddo_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, (%rdi)
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: uaddo_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: paddd %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, (%rdi)
+; SSSE3-NEXT: paddd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: uaddo_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: pmaxud %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: pmaxud %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm1, (%rdi)
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: uaddo_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpmaxud %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm2, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: uaddo_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpltud %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpcmpltud %xmm1, %xmm2, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
%val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
@@ -222,38 +231,38 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: movd %r8d, %xmm0
-; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: movd %edx, %xmm3
-; SSE2-NEXT: movd %esi, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movd %r8d, %xmm1
+; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT: movd %edx, %xmm1
+; SSE2-NEXT: movd %esi, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: movd %r9d, %xmm2
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: movd %r9d, %xmm3
; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, (%rcx)
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, (%rcx)
+; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: movq %xmm3, 16(%rcx)
; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: movq %xmm2, 16(%rdi)
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: movq %xmm1, 16(%rdi)
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: retq
;
@@ -261,38 +270,38 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %rdi, %rax
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSSE3-NEXT: movd %r8d, %xmm0
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT: movd %edx, %xmm3
-; SSSE3-NEXT: movd %esi, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movd %r8d, %xmm1
+; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT: movd %edx, %xmm1
+; SSSE3-NEXT: movd %esi, %xmm2
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSSE3-NEXT: movd %r9d, %xmm2
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSSE3-NEXT: movd %r9d, %xmm3
; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: paddd %xmm0, %xmm1
+; SSSE3-NEXT: paddd %xmm0, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm1, (%rcx)
-; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, (%rcx)
+; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: paddd %xmm2, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT: paddd %xmm1, %xmm3
; SSSE3-NEXT: movq %xmm3, 16(%rcx)
; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT: movq %xmm2, 16(%rdi)
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
+; SSSE3-NEXT: movq %xmm1, 16(%rdi)
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: retq
;
@@ -312,60 +321,60 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: paddd %xmm0, %xmm3
-; SSE41-NEXT: pmaxud %xmm3, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT: paddd %xmm3, %xmm0
+; SSE41-NEXT: pmaxud %xmm0, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: paddd %xmm2, %xmm1
-; SSE41-NEXT: pmaxud %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm4, %xmm2
-; SSE41-NEXT: movq %xmm1, 16(%rcx)
-; SSE41-NEXT: movdqa %xmm3, (%rcx)
-; SSE41-NEXT: movq %xmm2, 16(%rdi)
-; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: paddd %xmm1, %xmm2
+; SSE41-NEXT: pmaxud %xmm2, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: movq %xmm2, 16(%rcx)
+; SSE41-NEXT: movdqa %xmm0, (%rcx)
+; SSE41-NEXT: movq %xmm1, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm3, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: uaddo_v6i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm5
+; AVX1-NEXT: vpmaxud %xmm1, %xmm5, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
-; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovq %xmm3, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm5, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: uaddo_v6i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
-; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpmaxud %ymm1, %ymm2, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vmovq %xmm1, 16(%rdi)
+; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: uaddo_v6i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT: vpcmpltud %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpcmpltud %ymm1, %ymm2, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, 16(%rdi)
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512-NEXT: vmovq %xmm1, 16(%rdi)
+; AVX512-NEXT: vmovdqa %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
%val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
@@ -379,83 +388,89 @@ define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; SSE2-LABEL: uaddo_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, (%rdi)
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm3, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: uaddo_v8i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: paddd %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, (%rdi)
+; SSSE3-NEXT: paddd %xmm2, %xmm0
; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: paddd %xmm1, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT: paddd %xmm3, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pxor %xmm1, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: uaddo_v8i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: paddd %xmm0, %xmm2
-; SSE41-NEXT: pmaxud %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: paddd %xmm2, %xmm0
+; SSE41-NEXT: pmaxud %xmm0, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: paddd %xmm1, %xmm3
-; SSE41-NEXT: pmaxud %xmm3, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm1
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm3, 16(%rdi)
-; SSE41-NEXT: movdqa %xmm2, (%rdi)
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: paddd %xmm3, %xmm1
+; SSE41-NEXT: pmaxud %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE41-NEXT: pxor %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: uaddo_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm5
+; AVX1-NEXT: vpmaxud %xmm1, %xmm5, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi)
-; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa %xmm3, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm5, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: uaddo_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpmaxud %ymm1, %ymm2, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: uaddo_v8i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT: vpcmpltud %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpcmpltud %ymm1, %ymm2, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT: vmovdqa %ymm1, (%rdi)
+; AVX512-NEXT: vmovdqa %ymm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
%val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
@@ -469,143 +484,155 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; SSE2-LABEL: uaddo_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: paddd %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, (%rdi)
+; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm5, 16(%rdi)
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm6, 32(%rdi)
+; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT: paddd %xmm6, %xmm2
; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pxor %xmm7, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm7, 48(%rdi)
+; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
+; SSE2-NEXT: pxor %xmm8, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT: paddd %xmm7, %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm7
+; SSE2-NEXT: pxor %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
+; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: uaddo_v16i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: paddd %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm0
-; SSSE3-NEXT: movdqa %xmm4, (%rdi)
+; SSSE3-NEXT: paddd %xmm4, %xmm0
; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0
-; SSSE3-NEXT: paddd %xmm1, %xmm5
-; SSSE3-NEXT: pxor %xmm8, %xmm1
-; SSSE3-NEXT: movdqa %xmm5, 16(%rdi)
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: pxor %xmm8, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: paddd %xmm5, %xmm1
; SSSE3-NEXT: pxor %xmm8, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
-; SSSE3-NEXT: paddd %xmm2, %xmm6
-; SSSE3-NEXT: pxor %xmm8, %xmm2
-; SSSE3-NEXT: movdqa %xmm6, 32(%rdi)
+; SSSE3-NEXT: movdqa %xmm1, 16(%rdi)
+; SSSE3-NEXT: pxor %xmm8, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
+; SSSE3-NEXT: paddd %xmm6, %xmm2
; SSSE3-NEXT: pxor %xmm8, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2
-; SSSE3-NEXT: paddd %xmm3, %xmm7
-; SSSE3-NEXT: pxor %xmm8, %xmm3
-; SSSE3-NEXT: pxor %xmm7, %xmm8
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
-; SSSE3-NEXT: movdqa %xmm7, 48(%rdi)
+; SSSE3-NEXT: movdqa %xmm2, 32(%rdi)
+; SSSE3-NEXT: pxor %xmm8, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT: paddd %xmm7, %xmm3
+; SSSE3-NEXT: pxor %xmm8, %xmm7
+; SSSE3-NEXT: pxor %xmm3, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7
+; SSSE3-NEXT: movdqa %xmm3, 48(%rdi)
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm7, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: uaddo_v16i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: paddd %xmm0, %xmm4
-; SSE41-NEXT: pmaxud %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: paddd %xmm4, %xmm0
+; SSE41-NEXT: pmaxud %xmm0, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpeqd %xmm8, %xmm8
-; SSE41-NEXT: pxor %xmm8, %xmm0
-; SSE41-NEXT: paddd %xmm1, %xmm5
-; SSE41-NEXT: pmaxud %xmm5, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE41-NEXT: pxor %xmm8, %xmm1
-; SSE41-NEXT: paddd %xmm2, %xmm6
-; SSE41-NEXT: pmaxud %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm2
-; SSE41-NEXT: pxor %xmm8, %xmm2
-; SSE41-NEXT: paddd %xmm3, %xmm7
-; SSE41-NEXT: pmaxud %xmm7, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm3
-; SSE41-NEXT: pxor %xmm8, %xmm3
-; SSE41-NEXT: movdqa %xmm7, 48(%rdi)
-; SSE41-NEXT: movdqa %xmm6, 32(%rdi)
-; SSE41-NEXT: movdqa %xmm5, 16(%rdi)
-; SSE41-NEXT: movdqa %xmm4, (%rdi)
+; SSE41-NEXT: pxor %xmm8, %xmm4
+; SSE41-NEXT: paddd %xmm5, %xmm1
+; SSE41-NEXT: pmaxud %xmm1, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm5
+; SSE41-NEXT: pxor %xmm8, %xmm5
+; SSE41-NEXT: paddd %xmm6, %xmm2
+; SSE41-NEXT: pmaxud %xmm2, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE41-NEXT: pxor %xmm8, %xmm6
+; SSE41-NEXT: paddd %xmm7, %xmm3
+; SSE41-NEXT: pmaxud %xmm3, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE41-NEXT: pxor %xmm7, %xmm8
+; SSE41-NEXT: movdqa %xmm3, 48(%rdi)
+; SSE41-NEXT: movdqa %xmm2, 32(%rdi)
+; SSE41-NEXT: movdqa %xmm1, 16(%rdi)
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm1
+; SSE41-NEXT: movdqa %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm8, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: uaddo_v16i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpmaxud %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm6
+; AVX1-NEXT: vpmaxud %xmm3, %xmm6, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpmaxud %xmm6, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm6
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpackssdw %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmaxud %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm7
+; AVX1-NEXT: vpmaxud %xmm2, %xmm7, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi)
-; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
+; AVX1-NEXT: vmovdqa %xmm6, 32(%rdi)
; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi)
-; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX1-NEXT: vmovdqa %xmm7, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: uaddo_v16i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpmaxud %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpcmpeqd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm4
+; AVX2-NEXT: vpmaxud %ymm3, %ymm4, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpmaxud %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm5
+; AVX2-NEXT: vpmaxud %ymm2, %ymm5, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
-; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm4, 32(%rdi)
+; AVX2-NEXT: vmovdqa %ymm5, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: uaddo_v16i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1
-; AVX512-NEXT: vpcmpltud %zmm0, %zmm1, %k1
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vpcmpltud %zmm1, %zmm2, %k1
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi)
+; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
%val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
@@ -618,19 +645,19 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; SSE2-LABEL: uaddo_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pmaxub %xmm0, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pslld $31, %xmm4
-; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
@@ -639,25 +666,25 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $31, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: movdqa %xmm1, (%rdi)
-; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: uaddo_v16i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: paddb %xmm0, %xmm1
-; SSSE3-NEXT: pmaxub %xmm1, %xmm0
-; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0
+; SSSE3-NEXT: paddb %xmm1, %xmm0
+; SSSE3-NEXT: pmaxub %xmm0, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm0, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT: pxor %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: pxor %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: pslld $31, %xmm4
-; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: pslld $31, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
@@ -666,22 +693,22 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pslld $31, %xmm3
; SSSE3-NEXT: psrad $31, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, (%rdi)
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: uaddo_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: paddb %xmm0, %xmm1
-; SSE41-NEXT: pmaxub %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: pmaxub %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pmovsxbd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; SSE41-NEXT: pslld $31, %xmm4
-; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: pxor %xmm1, %xmm3
+; SSE41-NEXT: pmovsxbd %xmm3, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT: pslld $31, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm2
@@ -690,14 +717,14 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT: pslld $31, %xmm3
; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: movdqa %xmm1, (%rdi)
-; SSE41-NEXT: movdqa %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: uaddo_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpmaxub %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpmaxub %xmm1, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1
@@ -716,7 +743,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
; AVX2-LABEL: uaddo_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpmaxub %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpmaxub %xmm1, %xmm2, %xmm0
; AVX2-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
@@ -728,10 +755,10 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: uaddo_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpltub %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpcmpltub %xmm1, %xmm2, %k1
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
%val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
@@ -744,84 +771,82 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
; SSE2-LABEL: uaddo_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: paddw %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: paddw %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pslld $31, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: movdqa %xmm1, (%rdi)
-; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: uaddo_v8i16:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT: paddw %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT: paddw %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtw %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: pslld $31, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: pslld $31, %xmm2
-; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, (%rdi)
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: uaddo_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: paddw %xmm0, %xmm1
-; SSE41-NEXT: pmaxuw %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT: paddw %xmm1, %xmm0
+; SSE41-NEXT: pmaxuw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: pmovsxwd %xmm2, %xmm0
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pslld $31, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: movdqa %xmm1, (%rdi)
-; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pmovsxwd %xmm1, %xmm2
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE41-NEXT: pslld $31, %xmm1
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: movdqa %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: uaddo_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpmaxuw %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: uaddo_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpmaxuw %xmm1, %xmm2, %xmm0
+; AVX2-NEXT: vpcmpeqw %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: uaddo_v8i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpltuw %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpcmpltuw %xmm1, %xmm2, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
@@ -835,25 +860,26 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; SSE-LABEL: uaddo_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT: paddq %xmm0, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: paddq %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSE-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,3,3]
; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,3,3]
-; SSE-NEXT: por %xmm3, %xmm0
-; SSE-NEXT: movdqa %xmm1, (%rdi)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,3,3]
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uaddo_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
@@ -864,7 +890,7 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
; AVX2-LABEL: uaddo_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
@@ -874,11 +900,11 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: uaddo_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpltuq %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpcmpltuq %xmm1, %xmm2, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX512-NEXT: vmovdqa %xmm2, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
>From dae0ff7a2f78ba086ac13777be0a081d6a3b59a9 Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Thu, 11 Dec 2025 20:42:24 +0530
Subject: [PATCH 8/8] resolve merge conflict
---
llvm/test/CodeGen/RISCV/rvv/vp-splat.ll | 744 ------------------------
1 file changed, 744 deletions(-)
delete mode 100644 llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
deleted file mode 100644
index aeee1fa8215f0..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splat.ll
+++ /dev/null
@@ -1,744 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,NOZFMIN,ZVFH
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,NOZFMIN,ZVFH
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,NOZFMIN,ZVFHMIN
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,NOZFMIN,ZVFHMIN
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfhmin,+zfbfmin,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,ZFMIN
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfhmin,+zfbfmin,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,ZFMIN
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfhmin,+zvfhmin,+experimental-zvfbfa -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,ZVFBFA
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfhmin,+zvfhmin,+experimental-zvfbfa -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,ZVFBFA
-
-define <vscale x 1 x i8> @vp_splat_nxv1i8(i8 %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 1 x i8> @llvm.experimental.vp.splat.nxv1i8(i8 %val, <vscale x 1 x i1> %m, i32 %evl)
- ret <vscale x 1 x i8> %splat
-}
-
-define <vscale x 2 x i8> @vp_splat_nxv2i8(i8 %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 2 x i8> @llvm.experimental.vp.splat.nxv2i8(i8 %val, <vscale x 2 x i1> %m, i32 %evl)
- ret <vscale x 2 x i8> %splat
-}
-
-define <vscale x 4 x i8> @vp_splat_nxv4i8(i8 %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 4 x i8> @llvm.experimental.vp.splat.nxv4i8(i8 %val, <vscale x 4 x i1> %m, i32 %evl)
- ret <vscale x 4 x i8> %splat
-}
-
-define <vscale x 8 x i8> @vp_splat_nxv8i8(i8 %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv8i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 8 x i8> @llvm.experimental.vp.splat.nxv8i8(i8 %val, <vscale x 8 x i1> %m, i32 %evl)
- ret <vscale x 8 x i8> %splat
-}
-
-define <vscale x 16 x i8> @vp_splat_nxv16i8(i8 %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv16i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 16 x i8> @llvm.experimental.vp.splat.nxv16i8(i8 %val, <vscale x 16 x i1> %m, i32 %evl)
- ret <vscale x 16 x i8> %splat
-}
-
-define <vscale x 32 x i8> @vp_splat_nxv32i8(i8 %val, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 32 x i8> @llvm.experimental.vp.splat.nxv32i8(i8 %val, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x i8> %splat
-}
-
-define <vscale x 64 x i8> @vp_splat_nxv64i8(i8 %val, <vscale x 64 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv64i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 64 x i8> @llvm.experimental.vp.splat.nxv64i8(i8 %val, <vscale x 64 x i1> %m, i32 %evl)
- ret <vscale x 64 x i8> %splat
-}
-
-define <vscale x 1 x i16> @vp_splat_nxv1i16(i16 %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 1 x i16> @llvm.experimental.vp.splat.nxv1i16(i16 %val, <vscale x 1 x i1> %m, i32 %evl)
- ret <vscale x 1 x i16> %splat
-}
-
-define <vscale x 2 x i16> @vp_splat_nxv2i16(i16 %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 2 x i16> @llvm.experimental.vp.splat.nxv2i16(i16 %val, <vscale x 2 x i1> %m, i32 %evl)
- ret <vscale x 2 x i16> %splat
-}
-
-define <vscale x 4 x i16> @vp_splat_nxv4i16(i16 %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 4 x i16> @llvm.experimental.vp.splat.nxv4i16(i16 %val, <vscale x 4 x i1> %m, i32 %evl)
- ret <vscale x 4 x i16> %splat
-}
-
-define <vscale x 8 x i16> @vp_splat_nxv8i16(i16 %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv8i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 8 x i16> @llvm.experimental.vp.splat.nxv8i16(i16 %val, <vscale x 8 x i1> %m, i32 %evl)
- ret <vscale x 8 x i16> %splat
-}
-
-define <vscale x 16 x i16> @vp_splat_nxv16i16(i16 %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv16i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 16 x i16> @llvm.experimental.vp.splat.nxv16i16(i16 %val, <vscale x 16 x i1> %m, i32 %evl)
- ret <vscale x 16 x i16> %splat
-}
-
-define <vscale x 32 x i16> @vp_splat_nxv32i16(i16 %val, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv32i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 32 x i16> @llvm.experimental.vp.splat.nxv32i16(i16 %val, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x i16> %splat
-}
-
-define <vscale x 1 x i32> @vp_splat_nxv1i32(i32 %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv1i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 1 x i32> @llvm.experimental.vp.splat.nxv1i32(i32 %val, <vscale x 1 x i1> %m, i32 %evl)
- ret <vscale x 1 x i32> %splat
-}
-
-define <vscale x 2 x i32> @vp_splat_nxv2i32(i32 %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv2i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 2 x i32> @llvm.experimental.vp.splat.nxv2i32(i32 %val, <vscale x 2 x i1> %m, i32 %evl)
- ret <vscale x 2 x i32> %splat
-}
-
-define <vscale x 4 x i32> @vp_splat_nxv4i32(i32 %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 4 x i32> @llvm.experimental.vp.splat.nxv4i32(i32 %val, <vscale x 4 x i1> %m, i32 %evl)
- ret <vscale x 4 x i32> %splat
-}
-
-define <vscale x 8 x i32> @vp_splat_nxv8i32(i32 %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv8i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 8 x i32> @llvm.experimental.vp.splat.nxv8i32(i32 %val, <vscale x 8 x i1> %m, i32 %evl)
- ret <vscale x 8 x i32> %splat
-}
-
-define <vscale x 16 x i32> @vp_splat_nxv16i32(i32 %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv16i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 16 x i32> @llvm.experimental.vp.splat.nxv16i32(i32 %val, <vscale x 16 x i1> %m, i32 %evl)
- ret <vscale x 16 x i32> %splat
-}
-
-define <vscale x 1 x i64> @vp_splat_nxv1i64(i64 %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_splat_nxv1i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT: vlse64.v v8, (a0), zero
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vp_splat_nxv1i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: ret
- %splat = call <vscale x 1 x i64> @llvm.experimental.vp.splat.nxv1i64(i64 %val, <vscale x 1 x i1> %m, i32 %evl)
- ret <vscale x 1 x i64> %splat
-}
-
-define <vscale x 2 x i64> @vp_splat_nxv2i64(i64 %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_splat_nxv2i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT: vlse64.v v8, (a0), zero
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vp_splat_nxv2i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: ret
- %splat = call <vscale x 2 x i64> @llvm.experimental.vp.splat.nxv2i64(i64 %val, <vscale x 2 x i1> %m, i32 %evl)
- ret <vscale x 2 x i64> %splat
-}
-
-define <vscale x 4 x i64> @vp_splat_nxv4i64(i64 %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_splat_nxv4i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT: vlse64.v v8, (a0), zero
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vp_splat_nxv4i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: ret
- %splat = call <vscale x 4 x i64> @llvm.experimental.vp.splat.nxv4i64(i64 %val, <vscale x 4 x i1> %m, i32 %evl)
- ret <vscale x 4 x i64> %splat
-}
-
-define <vscale x 8 x i64> @vp_splat_nxv8i64(i64 %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vp_splat_nxv8i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: sw a1, 12(sp)
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT: vlse64.v v8, (a0), zero
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vp_splat_nxv8i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: ret
- %splat = call <vscale x 8 x i64> @llvm.experimental.vp.splat.nxv8i64(i64 %val, <vscale x 8 x i1> %m, i32 %evl)
- ret <vscale x 8 x i64> %splat
-}
-
-define <vscale x 1 x bfloat> @vp_splat_nxv1bf16(bfloat %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; NOZFMIN-LABEL: vp_splat_nxv1bf16:
-; NOZFMIN: # %bb.0:
-; NOZFMIN-NEXT: fmv.x.w a1, fa0
-; NOZFMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; NOZFMIN-NEXT: vmv.v.x v8, a1
-; NOZFMIN-NEXT: ret
-;
-; ZFMIN-LABEL: vp_splat_nxv1bf16:
-; ZFMIN: # %bb.0:
-; ZFMIN-NEXT: fmv.x.h a1, fa0
-; ZFMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; ZFMIN-NEXT: vmv.v.x v8, a1
-; ZFMIN-NEXT: ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv1bf16:
-; ZVFBFA: # %bb.0:
-; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma
-; ZVFBFA-NEXT: vfmv.v.f v8, fa0
-; ZVFBFA-NEXT: ret
- %splat = call <vscale x 1 x bfloat> @llvm.experimental.vp.splat.nxv1bf16(bfloat %val, <vscale x 1 x i1> %m, i32 %evl)
- ret <vscale x 1 x bfloat> %splat
-}
-
-define <vscale x 2 x bfloat> @vp_splat_nxv2bf16(bfloat %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; NOZFMIN-LABEL: vp_splat_nxv2bf16:
-; NOZFMIN: # %bb.0:
-; NOZFMIN-NEXT: fmv.x.w a1, fa0
-; NOZFMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; NOZFMIN-NEXT: vmv.v.x v8, a1
-; NOZFMIN-NEXT: ret
-;
-; ZFMIN-LABEL: vp_splat_nxv2bf16:
-; ZFMIN: # %bb.0:
-; ZFMIN-NEXT: fmv.x.h a1, fa0
-; ZFMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; ZFMIN-NEXT: vmv.v.x v8, a1
-; ZFMIN-NEXT: ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv2bf16:
-; ZVFBFA: # %bb.0:
-; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma
-; ZVFBFA-NEXT: vfmv.v.f v8, fa0
-; ZVFBFA-NEXT: ret
- %splat = call <vscale x 2 x bfloat> @llvm.experimental.vp.splat.nxv2bf16(bfloat %val, <vscale x 2 x i1> %m, i32 %evl)
- ret <vscale x 2 x bfloat> %splat
-}
-
-define <vscale x 4 x bfloat> @vp_splat_nxv4bf16(bfloat %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; NOZFMIN-LABEL: vp_splat_nxv4bf16:
-; NOZFMIN: # %bb.0:
-; NOZFMIN-NEXT: fmv.x.w a1, fa0
-; NOZFMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; NOZFMIN-NEXT: vmv.v.x v8, a1
-; NOZFMIN-NEXT: ret
-;
-; ZFMIN-LABEL: vp_splat_nxv4bf16:
-; ZFMIN: # %bb.0:
-; ZFMIN-NEXT: fmv.x.h a1, fa0
-; ZFMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; ZFMIN-NEXT: vmv.v.x v8, a1
-; ZFMIN-NEXT: ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv4bf16:
-; ZVFBFA: # %bb.0:
-; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma
-; ZVFBFA-NEXT: vfmv.v.f v8, fa0
-; ZVFBFA-NEXT: ret
- %splat = call <vscale x 4 x bfloat> @llvm.experimental.vp.splat.nxv4bf16(bfloat %val, <vscale x 4 x i1> %m, i32 %evl)
- ret <vscale x 4 x bfloat> %splat
-}
-
-define <vscale x 8 x bfloat> @vp_splat_nxv8bf16(bfloat %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; NOZFMIN-LABEL: vp_splat_nxv8bf16:
-; NOZFMIN: # %bb.0:
-; NOZFMIN-NEXT: fmv.x.w a1, fa0
-; NOZFMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; NOZFMIN-NEXT: vmv.v.x v8, a1
-; NOZFMIN-NEXT: ret
-;
-; ZFMIN-LABEL: vp_splat_nxv8bf16:
-; ZFMIN: # %bb.0:
-; ZFMIN-NEXT: fmv.x.h a1, fa0
-; ZFMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; ZFMIN-NEXT: vmv.v.x v8, a1
-; ZFMIN-NEXT: ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv8bf16:
-; ZVFBFA: # %bb.0:
-; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma
-; ZVFBFA-NEXT: vfmv.v.f v8, fa0
-; ZVFBFA-NEXT: ret
- %splat = call <vscale x 8 x bfloat> @llvm.experimental.vp.splat.nxv8bf16(bfloat %val, <vscale x 8 x i1> %m, i32 %evl)
- ret <vscale x 8 x bfloat> %splat
-}
-
-define <vscale x 16 x bfloat> @vp_splat_nxv16bf16(bfloat %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; NOZFMIN-LABEL: vp_splat_nxv16bf16:
-; NOZFMIN: # %bb.0:
-; NOZFMIN-NEXT: fmv.x.w a1, fa0
-; NOZFMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; NOZFMIN-NEXT: vmv.v.x v8, a1
-; NOZFMIN-NEXT: ret
-;
-; ZFMIN-LABEL: vp_splat_nxv16bf16:
-; ZFMIN: # %bb.0:
-; ZFMIN-NEXT: fmv.x.h a1, fa0
-; ZFMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZFMIN-NEXT: vmv.v.x v8, a1
-; ZFMIN-NEXT: ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv16bf16:
-; ZVFBFA: # %bb.0:
-; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma
-; ZVFBFA-NEXT: vfmv.v.f v8, fa0
-; ZVFBFA-NEXT: ret
- %splat = call <vscale x 16 x bfloat> @llvm.experimental.vp.splat.nxv16bf16(bfloat %val, <vscale x 16 x i1> %m, i32 %evl)
- ret <vscale x 16 x bfloat> %splat
-}
-
-define <vscale x 32 x bfloat> @vp_splat_nxv32bf16(bfloat %val, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; NOZFMIN-LABEL: vp_splat_nxv32bf16:
-; NOZFMIN: # %bb.0:
-; NOZFMIN-NEXT: fmv.x.w a1, fa0
-; NOZFMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; NOZFMIN-NEXT: vmv.v.x v8, a1
-; NOZFMIN-NEXT: ret
-;
-; ZFMIN-LABEL: vp_splat_nxv32bf16:
-; ZFMIN: # %bb.0:
-; ZFMIN-NEXT: fmv.x.h a1, fa0
-; ZFMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZFMIN-NEXT: vmv.v.x v8, a1
-; ZFMIN-NEXT: ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv32bf16:
-; ZVFBFA: # %bb.0:
-; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m8, ta, ma
-; ZVFBFA-NEXT: vfmv.v.f v8, fa0
-; ZVFBFA-NEXT: ret
- %splat = call <vscale x 32 x bfloat> @llvm.experimental.vp.splat.nxv32bf16(bfloat %val, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x bfloat> %splat
-}
-
-define <vscale x 1 x half> @vp_splat_nxv1f16(half %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vp_splat_nxv1f16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFH-NEXT: vfmv.v.f v8, fa0
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vp_splat_nxv1f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: fmv.x.w a1, fa0
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v8, a1
-; ZVFHMIN-NEXT: ret
-;
-; ZFMIN-LABEL: vp_splat_nxv1f16:
-; ZFMIN: # %bb.0:
-; ZFMIN-NEXT: fmv.x.h a1, fa0
-; ZFMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; ZFMIN-NEXT: vmv.v.x v8, a1
-; ZFMIN-NEXT: ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv1f16:
-; ZVFBFA: # %bb.0:
-; ZVFBFA-NEXT: fmv.x.h a1, fa0
-; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFBFA-NEXT: vmv.v.x v8, a1
-; ZVFBFA-NEXT: ret
- %splat = call <vscale x 1 x half> @llvm.experimental.vp.splat.nxv1f16(half %val, <vscale x 1 x i1> %m, i32 %evl)
- ret <vscale x 1 x half> %splat
-}
-
-define <vscale x 2 x half> @vp_splat_nxv2f16(half %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vp_splat_nxv2f16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFH-NEXT: vfmv.v.f v8, fa0
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vp_splat_nxv2f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: fmv.x.w a1, fa0
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v8, a1
-; ZVFHMIN-NEXT: ret
-;
-; ZFMIN-LABEL: vp_splat_nxv2f16:
-; ZFMIN: # %bb.0:
-; ZFMIN-NEXT: fmv.x.h a1, fa0
-; ZFMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; ZFMIN-NEXT: vmv.v.x v8, a1
-; ZFMIN-NEXT: ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv2f16:
-; ZVFBFA: # %bb.0:
-; ZVFBFA-NEXT: fmv.x.h a1, fa0
-; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFBFA-NEXT: vmv.v.x v8, a1
-; ZVFBFA-NEXT: ret
- %splat = call <vscale x 2 x half> @llvm.experimental.vp.splat.nxv2f16(half %val, <vscale x 2 x i1> %m, i32 %evl)
- ret <vscale x 2 x half> %splat
-}
-
-define <vscale x 4 x half> @vp_splat_nxv4f16(half %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vp_splat_nxv4f16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; ZVFH-NEXT: vfmv.v.f v8, fa0
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vp_splat_nxv4f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: fmv.x.w a1, fa0
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v8, a1
-; ZVFHMIN-NEXT: ret
-;
-; ZFMIN-LABEL: vp_splat_nxv4f16:
-; ZFMIN: # %bb.0:
-; ZFMIN-NEXT: fmv.x.h a1, fa0
-; ZFMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; ZFMIN-NEXT: vmv.v.x v8, a1
-; ZFMIN-NEXT: ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv4f16:
-; ZVFBFA: # %bb.0:
-; ZVFBFA-NEXT: fmv.x.h a1, fa0
-; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; ZVFBFA-NEXT: vmv.v.x v8, a1
-; ZVFBFA-NEXT: ret
- %splat = call <vscale x 4 x half> @llvm.experimental.vp.splat.nxv4f16(half %val, <vscale x 4 x i1> %m, i32 %evl)
- ret <vscale x 4 x half> %splat
-}
-
-define <vscale x 8 x half> @vp_splat_nxv8f16(half %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vp_splat_nxv8f16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; ZVFH-NEXT: vfmv.v.f v8, fa0
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vp_splat_nxv8f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: fmv.x.w a1, fa0
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v8, a1
-; ZVFHMIN-NEXT: ret
-;
-; ZFMIN-LABEL: vp_splat_nxv8f16:
-; ZFMIN: # %bb.0:
-; ZFMIN-NEXT: fmv.x.h a1, fa0
-; ZFMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; ZFMIN-NEXT: vmv.v.x v8, a1
-; ZFMIN-NEXT: ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv8f16:
-; ZVFBFA: # %bb.0:
-; ZVFBFA-NEXT: fmv.x.h a1, fa0
-; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; ZVFBFA-NEXT: vmv.v.x v8, a1
-; ZVFBFA-NEXT: ret
- %splat = call <vscale x 8 x half> @llvm.experimental.vp.splat.nxv8f16(half %val, <vscale x 8 x i1> %m, i32 %evl)
- ret <vscale x 8 x half> %splat
-}
-
-define <vscale x 16 x half> @vp_splat_nxv16f16(half %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vp_splat_nxv16f16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT: vfmv.v.f v8, fa0
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vp_splat_nxv16f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: fmv.x.w a1, fa0
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v8, a1
-; ZVFHMIN-NEXT: ret
-;
-; ZFMIN-LABEL: vp_splat_nxv16f16:
-; ZFMIN: # %bb.0:
-; ZFMIN-NEXT: fmv.x.h a1, fa0
-; ZFMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZFMIN-NEXT: vmv.v.x v8, a1
-; ZFMIN-NEXT: ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv16f16:
-; ZVFBFA: # %bb.0:
-; ZVFBFA-NEXT: fmv.x.h a1, fa0
-; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; ZVFBFA-NEXT: vmv.v.x v8, a1
-; ZVFBFA-NEXT: ret
- %splat = call <vscale x 16 x half> @llvm.experimental.vp.splat.nxv16f16(half %val, <vscale x 16 x i1> %m, i32 %evl)
- ret <vscale x 16 x half> %splat
-}
-
-define <vscale x 32 x half> @vp_splat_nxv32f16(half %val, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; ZVFH-LABEL: vp_splat_nxv32f16:
-; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT: vfmv.v.f v8, fa0
-; ZVFH-NEXT: ret
-;
-; ZVFHMIN-LABEL: vp_splat_nxv32f16:
-; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: fmv.x.w a1, fa0
-; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v8, a1
-; ZVFHMIN-NEXT: ret
-;
-; ZFMIN-LABEL: vp_splat_nxv32f16:
-; ZFMIN: # %bb.0:
-; ZFMIN-NEXT: fmv.x.h a1, fa0
-; ZFMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZFMIN-NEXT: vmv.v.x v8, a1
-; ZFMIN-NEXT: ret
-;
-; ZVFBFA-LABEL: vp_splat_nxv32f16:
-; ZVFBFA: # %bb.0:
-; ZVFBFA-NEXT: fmv.x.h a1, fa0
-; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; ZVFBFA-NEXT: vmv.v.x v8, a1
-; ZVFBFA-NEXT: ret
- %splat = call <vscale x 32 x half> @llvm.experimental.vp.splat.nxv32f16(half %val, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x half> %splat
-}
-
-define <vscale x 1 x float> @vp_splat_nxv1f32(float %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv1f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vfmv.v.f v8, fa0
-; CHECK-NEXT: ret
- %splat = call <vscale x 1 x float> @llvm.experimental.vp.splat.nxv1f32(float %val, <vscale x 1 x i1> %m, i32 %evl)
- ret <vscale x 1 x float> %splat
-}
-
-define <vscale x 2 x float> @vp_splat_nxv2f32(float %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv2f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vfmv.v.f v8, fa0
-; CHECK-NEXT: ret
- %splat = call <vscale x 2 x float> @llvm.experimental.vp.splat.nxv2f32(float %val, <vscale x 2 x i1> %m, i32 %evl)
- ret <vscale x 2 x float> %splat
-}
-
-define <vscale x 4 x float> @vp_splat_nxv4f32(float %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv4f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vfmv.v.f v8, fa0
-; CHECK-NEXT: ret
- %splat = call <vscale x 4 x float> @llvm.experimental.vp.splat.nxv4f32(float %val, <vscale x 4 x i1> %m, i32 %evl)
- ret <vscale x 4 x float> %splat
-}
-
-define <vscale x 8 x float> @vp_splat_nxv8f32(float %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv8f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vfmv.v.f v8, fa0
-; CHECK-NEXT: ret
- %splat = call <vscale x 8 x float> @llvm.experimental.vp.splat.nxv8f32(float %val, <vscale x 8 x i1> %m, i32 %evl)
- ret <vscale x 8 x float> %splat
-}
-
-define <vscale x 16 x float> @vp_splat_nxv16f32(float %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv16f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vfmv.v.f v8, fa0
-; CHECK-NEXT: ret
- %splat = call <vscale x 16 x float> @llvm.experimental.vp.splat.nxv16f32(float %val, <vscale x 16 x i1> %m, i32 %evl)
- ret <vscale x 16 x float> %splat
-}
-
-define <vscale x 1 x double> @vp_splat_nxv1f64(double %val, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv1f64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vfmv.v.f v8, fa0
-; CHECK-NEXT: ret
- %splat = call <vscale x 1 x double> @llvm.experimental.vp.splat.nxv1f64(double %val, <vscale x 1 x i1> %m, i32 %evl)
- ret <vscale x 1 x double> %splat
-}
-
-define <vscale x 2 x double> @vp_splat_nxv2f64(double %val, <vscale x 2 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv2f64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vfmv.v.f v8, fa0
-; CHECK-NEXT: ret
- %splat = call <vscale x 2 x double> @llvm.experimental.vp.splat.nxv2f64(double %val, <vscale x 2 x i1> %m, i32 %evl)
- ret <vscale x 2 x double> %splat
-}
-
-define <vscale x 4 x double> @vp_splat_nxv4f64(double %val, <vscale x 4 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv4f64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vfmv.v.f v8, fa0
-; CHECK-NEXT: ret
- %splat = call <vscale x 4 x double> @llvm.experimental.vp.splat.nxv4f64(double %val, <vscale x 4 x i1> %m, i32 %evl)
- ret <vscale x 4 x double> %splat
-}
-
-define <vscale x 8 x double> @vp_splat_nxv8f64(double %val, <vscale x 8 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv8f64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vfmv.v.f v8, fa0
-; CHECK-NEXT: ret
- %splat = call <vscale x 8 x double> @llvm.experimental.vp.splat.nxv8f64(double %val, <vscale x 8 x i1> %m, i32 %evl)
- ret <vscale x 8 x double> %splat
-}
-
-define <vscale x 16 x i31> @vp_splat_nxv16i31(i31 %val, <vscale x 16 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv16i31:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 16 x i31> @llvm.experimental.vp.splat.nxv16i31(i31 %val, <vscale x 16 x i1> %m, i32 %evl)
- ret <vscale x 16 x i31> %splat
-}
-
-define <vscale x 15 x i32> @vp_splat_nxv15i32(i32 %val, <vscale x 15 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv15i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 15 x i32> @llvm.experimental.vp.splat.nxv15i32(i32 %val, <vscale x 15 x i1> %m, i32 %evl)
- ret <vscale x 15 x i32> %splat
-}
-
-; Split case.
-define <vscale x 32 x i32> @vp_splat_nxv32i32(i32 %val, <vscale x 32 x i1> %m, i32 zeroext %evl) {
-; CHECK-LABEL: vp_splat_nxv32i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: sub a3, a1, a2
-; CHECK-NEXT: sltu a4, a2, a1
-; CHECK-NEXT: addi a4, a4, -1
-; CHECK-NEXT: and a3, a4, a3
-; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v16, a0
-; CHECK-NEXT: bltu a1, a2, .LBB45_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a1, a2
-; CHECK-NEXT: .LBB45_2:
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
- %splat = call <vscale x 32 x i32> @llvm.experimental.vp.splat.nxv32i32(i32 %val, <vscale x 32 x i1> %m, i32 %evl)
- ret <vscale x 32 x i32> %splat
-}