[llvm] [LSV] Insert casts to vectorize mismatched types (PR #134436)
Anshil Gandhi via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 5 02:18:17 PDT 2025
https://github.com/gandhi56 updated https://github.com/llvm/llvm-project/pull/134436
>From 4bc8fa8859ceb4a22a0b9889b17395972e152da3 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <gandhi21299 at gmail.com>
Date: Sat, 29 Mar 2025 18:14:27 -0400
Subject: [PATCH 1/5] [NFC][LSV] Precommit tests
This commit adds tests for an upcoming change that
inserts casts to vectorize loads and stores of
mismatched types.
---
.../AMDGPU/merge-vectors.ll | 29 +++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index ede2e4066c263..b9a948f46ea3b 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -111,3 +111,32 @@ entry:
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+
+; CHECK-LABEL: @merge_i32_2i16_float_4i8(
+; CHECK: load i32
+; CHECK: load <2 x i16>
+; CHECK: load float
+; CHECK: load <4 x i8>
+; CHECK: store i32
+; CHECK: store <2 x i16>
+; CHECK: store float
+; CHECK: store <4 x i8>
+define void @merge_i32_2i16_float_4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+ %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 0
+ %load1 = load i32, ptr addrspace(1) %gep1, align 4
+ %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %ptr1, i64 1
+ %load2 = load <2 x i16>, ptr addrspace(1) %gep2, align 4
+ %gep3 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 2
+ %load3 = load float, ptr addrspace(1) %gep3, align 4
+ %gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(1) %ptr1, i64 3
+ %load4 = load <4 x i8>, ptr addrspace(1) %gep4, align 4
+ %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0
+ store i32 %load1, ptr addrspace(2) %store.gep1, align 4
+ %store.gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(2) %ptr2, i64 1
+ store <2 x i16> %load2, ptr addrspace(2) %store.gep2, align 4
+ %store.gep3 = getelementptr inbounds float, ptr addrspace(2) %ptr2, i64 2
+ store float %load3, ptr addrspace(2) %store.gep3, align 4
+ %store.gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(2) %ptr2, i64 3
+ store <4 x i8> %load4, ptr addrspace(2) %store.gep4, align 4
+ ret void
+}
>From 42a70c79317235dfacdc06cb4427a416e8fd294d Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <gandhi21299 at gmail.com>
Date: Sat, 29 Mar 2025 18:14:27 -0400
Subject: [PATCH 2/5] [LSV] Precommit tests
This commit adds tests for an upcoming change that
inserts casts to increase vectorization of loads
and stores.
NFC.
---
.../AMDGPU/insert-casts-vectorize.ll | 89 +++++++++++++++++++
1 file changed, 89 insertions(+)
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll
new file mode 100644
index 0000000000000..a1bccd4665414
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - < %s | FileCheck %s
+
+define void @merge_i32_2i16_float_4i8(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @merge_i32_2i16_float_4i8(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR]], i64 0
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(1) [[PTR]], i64 1
+; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR]], i64 2
+; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr addrspace(1) [[GEP3]], align 4
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(1) [[PTR]], i64 3
+; CHECK-NEXT: [[LOAD4:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP4]], align 4
+; CHECK-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr, i64 0
+ %load1 = load i32, ptr addrspace(1) %gep1, align 4
+ %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %ptr, i64 1
+ %load2 = load <2 x i16>, ptr addrspace(1) %gep2, align 4
+ %gep3 = getelementptr inbounds float, ptr addrspace(1) %ptr, i64 2
+ %load3 = load float, ptr addrspace(1) %gep3, align 4
+ %gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(1) %ptr, i64 3
+ %load4 = load <4 x i8>, ptr addrspace(1) %gep4, align 4
+ ret void
+}
+
+define void @no_merge_i32_i16(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @no_merge_i32_i16(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[PTR]], i64 0
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[PTR]], i64 1
+; CHECK-NEXT: [[LOAD2:%.*]] = load i16, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds ptr, ptr addrspace(1) %ptr, i64 0
+ %load1 = load i32, ptr addrspace(1) %gep1, align 4
+ %gep2 = getelementptr inbounds ptr, ptr addrspace(1) %ptr, i64 1
+ %load2 = load i16, ptr addrspace(1) %gep2, align 4
+ ret void
+}
+
+define void @merge_i64_double_ptr(ptr addrspace(1) %ptr, ptr addrspace(2) %ptr2) {
+; CHECK-LABEL: define void @merge_i64_double_ptr(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[PTR]], i64 0
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[PTR]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load <3 x i64>, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: [[LOAD11:%.*]] = extractelement <3 x i64> [[TMP1]], i32 0
+; CHECK-NEXT: [[LOAD22:%.*]] = extractelement <3 x i64> [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[LOAD22]] to double
+; CHECK-NEXT: [[LOAD33:%.*]] = extractelement <3 x i64> [[TMP1]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[LOAD33]] to ptr
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LOAD11]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP2]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP5]], i32 1
+; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: store ptr [[TMP3]], ptr addrspace(1) [[GEP3]], align 4
+; CHECK-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds i64, ptr addrspace(1) %ptr, i64 0
+ %gep2 = getelementptr inbounds double, ptr addrspace(1) %ptr, i64 1
+ %gep3 = getelementptr inbounds ptr, ptr addrspace(1) %ptr, i64 2
+ %load1 = load i64, ptr addrspace(1) %gep1, align 4
+ %load2 = load double, ptr addrspace(1) %gep2, align 4
+ %load3 = load ptr, ptr addrspace(1) %gep3, align 4
+ store i64 %load1, ptr addrspace(1) %gep1, align 4
+ store double %load2, ptr addrspace(1) %gep2, align 4
+ store ptr %load3, ptr addrspace(1) %gep3, align 4
+ ret void
+}
+
+define void @merge_i16_half(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @merge_i16_half(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[PTR]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: [[LOAD11:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
+; CHECK-NEXT: [[LOAD22:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[LOAD22]] to half
+; CHECK-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds i16, ptr addrspace(1) %ptr, i64 0
+ %load1 = load i16, ptr addrspace(1) %gep1, align 4
+ %gep2 = getelementptr inbounds half, ptr addrspace(1) %ptr, i64 1
+ %load2 = load half, ptr addrspace(1) %gep2, align 4
+ ret void
+}
>From 49d9fabc026a9975a63b96e7fe27c383f72a490d Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <gandhi21299 at gmail.com>
Date: Sat, 29 Mar 2025 18:14:27 -0400
Subject: [PATCH 3/5] [LSV] Insert casts to vectorize mismatched types
After collecting equivalence classes, loop over
each distinct pair of them and check whether they
can be merged into one.
Consider classes A and B whose leaders differ only
in their scalar bitwidth; we do not merge them
otherwise. Let N be the scalar bitwidth of the
leader instruction in A. Iterate over all
instructions in B and ensure the total bitwidth of
each matches the total bitwidth of A's leader
instruction. Finally, cast each instruction in B
with a mismatched type to an intN type.
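As a rough illustration (not taken verbatim from this patch), an i32
load and an adjacent float load both have a 32-bit scalar type, so the
float access can be rewritten as an i32 access plus a bitcast, which
places both loads in a single equivalence class that the existing
chain logic can then vectorize:

  %gep0 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 0
  %v0 = load i32, ptr addrspace(1) %gep0, align 4
  %gep1 = getelementptr inbounds float, ptr addrspace(1) %p, i64 1
  ; the float slot is loaded as i32 and bitcast back to float:
  %v1.int = load i32, ptr addrspace(1) %gep1, align 4
  %v1 = bitcast i32 %v1.int to float
  ; both i32 loads can now be merged into one <2 x i32> load.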
---
.../Vectorize/LoadStoreVectorizer.cpp | 82 ++++++++++++++++-
.../AMDGPU/insert-casts-vectorize.ll | 89 -------------------
.../AMDGPU/merge-vectors.ll | 17 ++--
3 files changed, 87 insertions(+), 101 deletions(-)
delete mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 04b392829f0d7..c94f10fb8b855 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -324,6 +324,10 @@ class Vectorizer {
Instruction *ChainElem, Instruction *ChainBegin,
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
+ /// Merge two equivalence classes if casts can be inserted into one to match
+ /// the scalar bitwidth of the instructions in the other class.
+ void insertCastsToMergeClasses(EquivalenceClassMap &EQClasses);
+
/// Merges the equivalence classes if they have underlying objects that differ
/// by one level of indirection (i.e., one is a getelementptr and the other is
/// the base pointer in that getelementptr).
@@ -1310,6 +1314,82 @@ std::optional<APInt> Vectorizer::getConstantOffsetSelects(
return std::nullopt;
}
+void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
+ if (EQClasses.size() < 2)
+ return;
+
+ // Loop over all equivalence classes and try to merge them. Keep track of
+ // classes that are merged into others.
+ DenseSet<EqClassKey> ClassesToErase;
+ for (auto EC1 : EQClasses) {
+ for (auto EC2 : EQClasses) {
+ if (ClassesToErase.contains(EC2.first) || EC1 <= EC2)
+ continue;
+
+ auto [Ptr1, AS1, TySize1, IsLoad1] = EC1.first;
+ auto [Ptr2, AS2, TySize2, IsLoad2] = EC2.first;
+
+ // Attempt to merge EC2 into EC1. Skip if the pointers or address spaces
+ // differ, or if one class contains loads and the other stores. Also skip
+ // if the scalar bitwidth of the first class is smaller than that of the
+ // second, to avoid reconsidering the same pair of equivalence classes.
+ if (Ptr1 != Ptr2 || AS1 != AS2 || IsLoad1 != IsLoad2 || TySize1 < TySize2)
+ continue;
+
+ // Ensure all instructions in EC2 can be bitcast to NewTy.
+ // TODO: NewTyBits is needed because structured bindings cannot be
+ // captured by a lambda until C++20.
+ auto NewTyBits = std::get<2>(EC1.first);
+ if (any_of(EC2.second, [&](Instruction *I) {
+ return DL.getTypeSizeInBits(getLoadStoreType(I)) != NewTyBits;
+ }))
+ continue;
+
+ // Create a new type for the equivalence class.
+ /// TODO: NewTy should be an FP type for an all-FP equivalence class.
+ auto *NewTy = Type::getIntNTy(EC2.second[0]->getContext(), NewTyBits);
+ for (auto *Inst : EC2.second) {
+ auto *Ptr = getLoadStorePointerOperand(Inst);
+ auto *OrigTy = Inst->getType();
+ if (OrigTy == NewTy)
+ continue;
+ if (auto *LI = dyn_cast<LoadInst>(Inst)) {
+ Builder.SetInsertPoint(LI->getIterator());
+ auto *NewLoad = Builder.CreateLoad(NewTy, Ptr);
+ auto *Cast = Builder.CreateBitOrPointerCast(
+ NewLoad, OrigTy, NewLoad->getName() + ".cast");
+ LI->replaceAllUsesWith(Cast);
+ LI->eraseFromParent();
+ EQClasses[EC1.first].emplace_back(NewLoad);
+ } else {
+ auto *SI = cast<StoreInst>(Inst);
+ Builder.SetInsertPoint(SI->getIterator());
+ auto *Cast = Builder.CreateBitOrPointerCast(
+ SI->getValueOperand(), NewTy,
+ SI->getValueOperand()->getName() + ".cast");
+ auto *NewStore = Builder.CreateStore(
+ Cast, getLoadStorePointerOperand(SI), SI->isVolatile());
+ SI->eraseFromParent();
+ EQClasses[EC1.first].emplace_back(NewStore);
+ }
+ }
+
+ // Sort the instructions in the equivalence class by their order in the
+ // basic block. This is important to ensure that the instructions are
+ // vectorized in the correct order.
+ std::sort(EQClasses[EC1.first].begin(), EQClasses[EC1.first].end(),
+ [](Instruction *A, Instruction *B) {
+ return A && B && A->comesBefore(B);
+ });
+ ClassesToErase.insert(EC2.first);
+ }
+ }
+
+ // Erase the equivalence classes that were merged into others.
+ for (auto Key : ClassesToErase)
+ EQClasses.erase(Key);
+}
+
void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
if (EQClasses.size() < 2) // There is nothing to merge.
return;
@@ -1495,7 +1575,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
/*IsLoad=*/LI != nullptr}]
.emplace_back(&I);
}
-
+ insertCastsToMergeClasses(Ret);
mergeEquivalenceClasses(Ret);
return Ret;
}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll
deleted file mode 100644
index a1bccd4665414..0000000000000
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insert-casts-vectorize.ll
+++ /dev/null
@@ -1,89 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - < %s | FileCheck %s
-
-define void @merge_i32_2i16_float_4i8(ptr addrspace(1) %ptr) {
-; CHECK-LABEL: define void @merge_i32_2i16_float_4i8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR]], i64 0
-; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(1) [[PTR]], i64 1
-; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP2]], align 4
-; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR]], i64 2
-; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr addrspace(1) [[GEP3]], align 4
-; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(1) [[PTR]], i64 3
-; CHECK-NEXT: [[LOAD4:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP4]], align 4
-; CHECK-NEXT: ret void
-;
- %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr, i64 0
- %load1 = load i32, ptr addrspace(1) %gep1, align 4
- %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %ptr, i64 1
- %load2 = load <2 x i16>, ptr addrspace(1) %gep2, align 4
- %gep3 = getelementptr inbounds float, ptr addrspace(1) %ptr, i64 2
- %load3 = load float, ptr addrspace(1) %gep3, align 4
- %gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(1) %ptr, i64 3
- %load4 = load <4 x i8>, ptr addrspace(1) %gep4, align 4
- ret void
-}
-
-define void @no_merge_i32_i16(ptr addrspace(1) %ptr) {
-; CHECK-LABEL: define void @no_merge_i32_i16(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[PTR]], i64 0
-; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[PTR]], i64 1
-; CHECK-NEXT: [[LOAD2:%.*]] = load i16, ptr addrspace(1) [[GEP2]], align 4
-; CHECK-NEXT: ret void
-;
- %gep1 = getelementptr inbounds ptr, ptr addrspace(1) %ptr, i64 0
- %load1 = load i32, ptr addrspace(1) %gep1, align 4
- %gep2 = getelementptr inbounds ptr, ptr addrspace(1) %ptr, i64 1
- %load2 = load i16, ptr addrspace(1) %gep2, align 4
- ret void
-}
-
-define void @merge_i64_double_ptr(ptr addrspace(1) %ptr, ptr addrspace(2) %ptr2) {
-; CHECK-LABEL: define void @merge_i64_double_ptr(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[PTR]], i64 0
-; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[PTR]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load <3 x i64>, ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT: [[LOAD11:%.*]] = extractelement <3 x i64> [[TMP1]], i32 0
-; CHECK-NEXT: [[LOAD22:%.*]] = extractelement <3 x i64> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[LOAD22]] to double
-; CHECK-NEXT: [[LOAD33:%.*]] = extractelement <3 x i64> [[TMP1]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[LOAD33]] to ptr
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LOAD11]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP2]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP5]], i32 1
-; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT: store ptr [[TMP3]], ptr addrspace(1) [[GEP3]], align 4
-; CHECK-NEXT: ret void
-;
- %gep1 = getelementptr inbounds i64, ptr addrspace(1) %ptr, i64 0
- %gep2 = getelementptr inbounds double, ptr addrspace(1) %ptr, i64 1
- %gep3 = getelementptr inbounds ptr, ptr addrspace(1) %ptr, i64 2
- %load1 = load i64, ptr addrspace(1) %gep1, align 4
- %load2 = load double, ptr addrspace(1) %gep2, align 4
- %load3 = load ptr, ptr addrspace(1) %gep3, align 4
- store i64 %load1, ptr addrspace(1) %gep1, align 4
- store double %load2, ptr addrspace(1) %gep2, align 4
- store ptr %load3, ptr addrspace(1) %gep3, align 4
- ret void
-}
-
-define void @merge_i16_half(ptr addrspace(1) %ptr) {
-; CHECK-LABEL: define void @merge_i16_half(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[PTR]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP1]], align 4
-; CHECK-NEXT: [[LOAD11:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
-; CHECK-NEXT: [[LOAD22:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[LOAD22]] to half
-; CHECK-NEXT: ret void
-;
- %gep1 = getelementptr inbounds i16, ptr addrspace(1) %ptr, i64 0
- %load1 = load i16, ptr addrspace(1) %gep1, align 4
- %gep2 = getelementptr inbounds half, ptr addrspace(1) %ptr, i64 1
- %load2 = load half, ptr addrspace(1) %gep2, align 4
- ret void
-}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index b9a948f46ea3b..c364bc2da4c5d 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -95,10 +95,10 @@ entry:
ret void
}
-; Ideally this would be merged
; CHECK-LABEL: @merge_load_i32_v2i16(
-; CHECK: load i32,
-; CHECK: load <2 x i16>
+; CHECK: load <2 x i32>
+; CHECK: extractelement <2 x i32> %0, i32 0
+; CHECK: extractelement <2 x i32> %0, i32 1
define amdgpu_kernel void @merge_load_i32_v2i16(ptr addrspace(1) nocapture %a) #0 {
entry:
%a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1
@@ -113,14 +113,9 @@ attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
; CHECK-LABEL: @merge_i32_2i16_float_4i8(
-; CHECK: load i32
-; CHECK: load <2 x i16>
-; CHECK: load float
-; CHECK: load <4 x i8>
-; CHECK: store i32
-; CHECK: store <2 x i16>
-; CHECK: store float
-; CHECK: store <4 x i8>
+; CHECK: load <4 x i32>
+; CHECK: store <2 x i32>
+; CHECK: store <2 x i32>
define void @merge_i32_2i16_float_4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
%gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 0
%load1 = load i32, ptr addrspace(1) %gep1, align 4
>From 11ddce21f3408b250ef9f8e44c3255a04bd67550 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <gandhi21299 at gmail.com>
Date: Sat, 5 Apr 2025 09:33:34 +0200
Subject: [PATCH 4/5] [LSV] Decide the type to cast instructions into
For classes in which all instructions are of
integer, FP, or pointer type, it is preferable to
cast them to a type of that kind rather than to a
generic intN type.
This commit preprocesses the equivalence classes
and maintains a 3-bit bitvector per class, where a
bit is set if all instructions in the class are of
integer, FP, or pointer type, respectively.
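For example (illustrative only, mirroring the merge_fp_type test added
below), a float load next to a <2 x half> load forms an all-FP pair,
so the common 32-bit type is chosen to be float rather than i32:

  %gep1 = getelementptr inbounds float, ptr addrspace(1) %ptr, i64 0
  %a = load float, ptr addrspace(1) %gep1, align 4
  %gep2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %ptr, i64 1
  ; the <2 x half> slot is loaded as float and bitcast back:
  %b.f = load float, ptr addrspace(1) %gep2, align 4
  %b = bitcast float %b.f to <2 x half>
  ; the two float loads can then be vectorized into a <2 x float> load.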
---
.../Vectorize/LoadStoreVectorizer.cpp | 38 ++++++++++++++++++-
.../AMDGPU/merge-vectors.ll | 11 ++++++
2 files changed, 47 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index c94f10fb8b855..8a3c9f55f1465 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -60,6 +60,7 @@
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Bitset.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
@@ -1318,6 +1319,28 @@ void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
if (EQClasses.size() < 2)
return;
+ // For each class, determine whether all of its instructions are of integer,
+ // FP, or pointer type. This information helps decide which type the
+ // instructions should be cast to.
+ MapVector<EqClassKey, Bitset<3>> ClassAllTy;
+ for (auto C : EQClasses) {
+ if (all_of(EQClasses[C.first],
+ [](Instruction *I) {
+ return I->getType()->isIntOrIntVectorTy();
+ }))
+ ClassAllTy[C.first].set(0);
+ else if (all_of(EQClasses[C.first],
+ [](Instruction *I) {
+ return I->getType()->isFPOrFPVectorTy();
+ }))
+ ClassAllTy[C.first].set(1);
+ else if (all_of(EQClasses[C.first],
+ [](Instruction *I) {
+ return I->getType()->isPtrOrPtrVectorTy();
+ }))
+ ClassAllTy[C.first].set(2);
+ }
+
// Loop over all equivalence classes and try to merge them. Keep track of
// classes that are merged into others.
DenseSet<EqClassKey> ClassesToErase;
@@ -1346,8 +1369,19 @@ void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
continue;
// Create a new type for the equivalence class.
- /// TODO: NewTy should be an FP type for an all-FP equivalence class.
- auto *NewTy = Type::getIntNTy(EC2.second[0]->getContext(), NewTyBits);
+ auto &Ctx = EC2.second[0]->getContext();
+ Type *NewTy = Type::getIntNTy(EC2.second[0]->getContext(), NewTyBits);
+ if (ClassAllTy[EC1.first].test(1) && ClassAllTy[EC2.first].test(1)) {
+ if (NewTyBits == 16)
+ NewTy = Type::getHalfTy(Ctx);
+ else if (NewTyBits == 32)
+ NewTy = Type::getFloatTy(Ctx);
+ else if (NewTyBits == 64)
+ NewTy = Type::getDoubleTy(Ctx);
+ } else if (ClassAllTy[EC1.first].test(2) && ClassAllTy[EC2.first].test(2)) {
+ NewTy = PointerType::get(Ctx, AS2);
+ }
+
for (auto *Inst : EC2.second) {
auto *Ptr = getLoadStorePointerOperand(Inst);
auto *OrigTy = Inst->getType();
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index c364bc2da4c5d..caf5be69c7ab8 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -135,3 +135,14 @@ define void @merge_i32_2i16_float_4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %
store <4 x i8> %load4, ptr addrspace(2) %store.gep4, align 4
ret void
}
+
+; CHECK-LABEL: @merge_fp_type(
+; CHECK: load <2 x float>
+; CHECK: bitcast float {{.*}} to <2 x half>
+define void @merge_fp_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+ %gep1 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 0
+ %load1 = load float, ptr addrspace(1) %gep1, align 4
+ %gep2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %ptr1, i64 1
+ %load2 = load <2 x half>, ptr addrspace(1) %gep2, align 4
+ ret void
+}
\ No newline at end of file
>From d333a4d0342a8ff60e64bcee90e5666911dad9f5 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <gandhi21299 at gmail.com>
Date: Sat, 5 Apr 2025 11:17:15 +0200
Subject: [PATCH 5/5] [LSV] Update codegen tests affected by the previous commit
---
llvm/test/CodeGen/AMDGPU/bitop3.ll | 26 +-
.../AMDGPU/buffer-intrinsics-mmo-offsets.ll | 145 ++++----
llvm/test/CodeGen/AMDGPU/build_vector.ll | 14 +-
.../AMDGPU/dag-preserve-disjoint-flag.ll | 30 +-
llvm/test/CodeGen/AMDGPU/divrem24-assume.ll | 2 +-
.../CodeGen/AMDGPU/expand-variadic-call.ll | 42 +--
.../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 41 +--
llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 28 +-
llvm/test/CodeGen/AMDGPU/fabs.ll | 14 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 289 +++++++--------
llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 54 +--
llvm/test/CodeGen/AMDGPU/fdiv.ll | 333 +++++++++---------
llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 10 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 14 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 14 +-
llvm/test/CodeGen/AMDGPU/fshl.ll | 192 +++++-----
llvm/test/CodeGen/AMDGPU/fshr.ll | 195 +++++-----
llvm/test/CodeGen/AMDGPU/half.ll | 156 ++++----
.../CodeGen/AMDGPU/insert_vector_dynelt.ll | 54 +--
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 8 +-
llvm/test/CodeGen/AMDGPU/kernel-args.ll | 24 +-
llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 86 ++---
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 86 ++---
llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 36 +-
llvm/test/CodeGen/AMDGPU/llvm.log.ll | 76 ++--
llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 76 ++--
llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 18 +-
llvm/test/CodeGen/AMDGPU/min.ll | 196 +++++------
llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 2 +-
llvm/test/CodeGen/AMDGPU/rotl.ll | 68 ++--
llvm/test/CodeGen/AMDGPU/rotr.ll | 52 +--
llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 50 +--
llvm/test/CodeGen/AMDGPU/udivrem.ll | 111 +++---
llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 2 +-
34 files changed, 1239 insertions(+), 1305 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll
index eb149a93ee328..a453fbdd97ca0 100644
--- a/llvm/test/CodeGen/AMDGPU/bitop3.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitop3.ll
@@ -113,15 +113,10 @@ define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) {
}
define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
-; GFX950-SDAG-LABEL: and_and_and:
-; GFX950-SDAG: ; %bb.0:
-; GFX950-SDAG-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
-; GFX950-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX950-GISEL-LABEL: and_and_and:
-; GFX950-GISEL: ; %bb.0:
-; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
-; GFX950-GISEL-NEXT: ; return to shader part epilog
+; GCN-LABEL: and_and_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
+; GCN-NEXT: ; return to shader part epilog
%and1 = and i32 %a, %c
%and2 = and i32 %and1, %b
%ret_cast = bitcast i32 %and2 to float
@@ -131,15 +126,10 @@ define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
; ========= Multi bit functions =========
define amdgpu_ps float @test_12(i32 %a, i32 %b) {
-; GFX950-SDAG-LABEL: test_12:
-; GFX950-SDAG: ; %bb.0:
-; GFX950-SDAG-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
-; GFX950-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX950-GISEL-LABEL: test_12:
-; GFX950-GISEL: ; %bb.0:
-; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
-; GFX950-GISEL-NEXT: ; return to shader part epilog
+; GCN-LABEL: test_12:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
+; GCN-NEXT: ; return to shader part epilog
%nota = xor i32 %a, -1
%and1 = and i32 %nota, %b
%ret_cast = bitcast i32 %and1 to float
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
index 384beae07ce2e..1ec4da9d35605 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
@@ -11,7 +11,6 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
- ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg0, addrspace 6)
; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s64) from %ir.arg0, align 16, addrspace 6)
; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[REG_SEQUENCE]], 8, 0 :: (dereferenceable invariant load (s64) from %ir.arg0 + 8, basealign 16, addrspace 6)
; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
@@ -19,14 +18,14 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[COPY3]], %subreg.sub2, killed [[COPY2]], %subreg.sub3
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_1]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
@@ -35,14 +34,14 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]]
; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY7]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_3]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[COPY8]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_3]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
@@ -51,14 +50,14 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY]]
; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[COPY9]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY10]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[COPY10]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
@@ -68,14 +67,14 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[COPY11]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[COPY12]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY12]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
@@ -84,14 +83,14 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]]
; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_9]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[REG_SEQUENCE1]], [[S_MOV_B32_9]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[REG_SEQUENCE1]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY14]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[REG_SEQUENCE1]], [[COPY14]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET5]], [[REG_SEQUENCE1]], [[S_MOV_B32_9]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
@@ -100,14 +99,14 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]]
; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET7]], [[REG_SEQUENCE1]], [[COPY15]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[REG_SEQUENCE1]], [[S_MOV_B32_11]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[REG_SEQUENCE1]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[REG_SEQUENCE1]], [[COPY16]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5]], [[REG_SEQUENCE1]], [[S_MOV_B32_11]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
@@ -117,21 +116,21 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7]], [[REG_SEQUENCE1]], [[COPY17]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112
; GCN-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[REG_SEQUENCE1]], [[S_MOV_B32_13]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224
; GCN-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[REG_SEQUENCE1]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY22:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY21]], [[S_LOAD_DWORDX4_IMM]], [[COPY22]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY21]], [[REG_SEQUENCE1]], [[COPY22]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY23]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
@@ -147,19 +146,19 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY28]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120
; GCN-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[REG_SEQUENCE1]], [[S_MOV_B32_15]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240
; GCN-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[REG_SEQUENCE1]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[COPY32]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[REG_SEQUENCE1]], [[COPY32]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
@@ -175,18 +174,18 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY38]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY38]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY39]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256
; GCN-NEXT: [[COPY40:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY40]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY40]], [[REG_SEQUENCE1]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY42:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY41]], [[S_LOAD_DWORDX4_IMM]], [[COPY42]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY41]], [[REG_SEQUENCE1]], [[COPY42]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY43]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
@@ -202,19 +201,19 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY48:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY48]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY48]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136
; GCN-NEXT: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY49]], [[REG_SEQUENCE1]], [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272
; GCN-NEXT: [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY50]], [[REG_SEQUENCE1]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY52:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[COPY52]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY51]], [[REG_SEQUENCE1]], [[COPY52]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY53:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY53]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
@@ -230,18 +229,18 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY58:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[COPY58]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[COPY58]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY59:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY59]], [[REG_SEQUENCE1]], [[S_MOV_B32_4]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288
; GCN-NEXT: [[COPY60:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY60]], [[REG_SEQUENCE1]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY61:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY62:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY61]], [[S_LOAD_DWORDX4_IMM]], [[COPY62]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY61]], [[REG_SEQUENCE1]], [[COPY62]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY63:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[COPY63]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
@@ -257,19 +256,19 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN11]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY68:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[COPY68]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[COPY68]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152
; GCN-NEXT: [[COPY69:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY69]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY69]], [[REG_SEQUENCE1]], [[S_MOV_B32_21]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304
; GCN-NEXT: [[COPY70:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY70]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY70]], [[REG_SEQUENCE1]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY71:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY72:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY71]], [[S_LOAD_DWORDX4_IMM]], [[COPY72]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY71]], [[REG_SEQUENCE1]], [[COPY72]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY73:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[COPY73]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY74:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 7208eaeff8eb1..a5bd004db9ea7 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -271,13 +271,13 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshl_b32 s3, s3, 16
-; GFX8-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_lshl_b32 s0, s3, 16
+; GFX8-NEXT: s_lshl_b32 s1, s2, 16
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: build_v2i32_from_v4i16_shuffle:
diff --git a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
index 4b4718a2acb80..6f32d1dcb1462 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
@@ -10,8 +10,8 @@ define amdgpu_ps i32 @s_or_i32_disjoint(i32 inreg %a, i32 inreg %b) {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; CHECK-NEXT: %3:sreg_32 = disjoint S_OR_B32 [[COPY1]], [[COPY]], implicit-def dead $scc
- ; CHECK-NEXT: $sgpr0 = COPY %3
+ ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY1]], [[COPY]], implicit-def dead $scc
+ ; CHECK-NEXT: $sgpr0 = COPY [[S_OR_B32_]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0
%result = or disjoint i32 %a, %b
ret i32 %result
@@ -26,10 +26,10 @@ define amdgpu_ps <2 x i32> @s_or_v2i32_disjoint(<2 x i32> inreg %a, <2 x i32> in
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; CHECK-NEXT: %5:sreg_32 = disjoint S_OR_B32 [[COPY3]], [[COPY1]], implicit-def dead $scc
- ; CHECK-NEXT: %6:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc
- ; CHECK-NEXT: $sgpr0 = COPY %5
- ; CHECK-NEXT: $sgpr1 = COPY %6
+ ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY3]], [[COPY1]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc
+ ; CHECK-NEXT: $sgpr0 = COPY [[S_OR_B32_]]
+ ; CHECK-NEXT: $sgpr1 = COPY [[S_OR_B32_1]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%result = or disjoint <2 x i32> %a, %b
ret <2 x i32> %result
@@ -42,8 +42,8 @@ define i32 @v_or_i32_disjoint(i32 %a, i32 %b) {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: %10:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec
- ; CHECK-NEXT: $vgpr0 = COPY %10
+ ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%result = or disjoint i32 %a, %b
ret i32 %result
@@ -58,10 +58,10 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: %12:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
- ; CHECK-NEXT: %13:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
- ; CHECK-NEXT: $vgpr0 = COPY %12
- ; CHECK-NEXT: $vgpr1 = COPY %13
+ ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
+ ; CHECK-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
%result = or disjoint <2 x i32> %a, %b
ret <2 x i32> %result
@@ -78,9 +78,9 @@ define amdgpu_ps i64 @s_or_i64_disjoint(i64 inreg %a, i64 inreg %b) {
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; CHECK-NEXT: %7:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY %7.sub1
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY %7.sub0
+ ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0
; CHECK-NEXT: $sgpr0 = COPY [[COPY5]]
; CHECK-NEXT: $sgpr1 = COPY [[COPY4]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/divrem24-assume.ll b/llvm/test/CodeGen/AMDGPU/divrem24-assume.ll
index dc79385d9eaca..95ca77669c4d8 100644
--- a/llvm/test/CodeGen/AMDGPU/divrem24-assume.ll
+++ b/llvm/test/CodeGen/AMDGPU/divrem24-assume.ll
@@ -4,7 +4,7 @@
define amdgpu_kernel void @divrem24_assume(ptr addrspace(1) %arg, i32 %arg1) {
; CHECK-LABEL: @divrem24_assume(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
+; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG0:![0-9]+]]
; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[ARG1:%.*]], 42
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]])
; CHECK-NEXT: [[TMP0:%.*]] = uitofp i32 [[TMP]] to float
diff --git a/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll
index cca70005b4cdc..488a5ed3f7663 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll
@@ -281,17 +281,17 @@ define hidden void @i32_libcS(i32 noundef %x, i8 %y.coerce0, i16 %y.coerce1, i32
; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, i8 %y.coerce0, i16 %y.coerce1, i32 %y.coerce2, i64 %y.coerce3, float %y.coerce4, double %y.coerce5) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %i32_libcS.vararg, align 4, addrspace(5)
-; CHECK-NEXT: %.fca.0.insert = insertvalue %struct.libcS poison, i8 %y.coerce0, 0
-; CHECK-NEXT: %.fca.1.insert = insertvalue %struct.libcS %.fca.0.insert, i16 %y.coerce1, 1
-; CHECK-NEXT: %.fca.2.insert = insertvalue %struct.libcS %.fca.1.insert, i32 %y.coerce2, 2
-; CHECK-NEXT: %.fca.3.insert = insertvalue %struct.libcS %.fca.2.insert, i64 %y.coerce3, 3
-; CHECK-NEXT: %.fca.4.insert = insertvalue %struct.libcS %.fca.3.insert, float %y.coerce4, 4
-; CHECK-NEXT: %.fca.5.insert = insertvalue %struct.libcS %.fca.4.insert, double %y.coerce5, 5
+; CHECK-NEXT: %dotfca.0.insert = insertvalue %struct.libcS poison, i8 %y.coerce0, 0
+; CHECK-NEXT: %dotfca.1.insert = insertvalue %struct.libcS %dotfca.0.insert, i16 %y.coerce1, 1
+; CHECK-NEXT: %dotfca.2.insert = insertvalue %struct.libcS %dotfca.1.insert, i32 %y.coerce2, 2
+; CHECK-NEXT: %dotfca.3.insert = insertvalue %struct.libcS %dotfca.2.insert, i64 %y.coerce3, 3
+; CHECK-NEXT: %dotfca.4.insert = insertvalue %struct.libcS %dotfca.3.insert, float %y.coerce4, 4
+; CHECK-NEXT: %dotfca.5.insert = insertvalue %struct.libcS %dotfca.4.insert, double %y.coerce5, 5
; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4
; CHECK-NEXT: %1 = getelementptr inbounds nuw %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
-; CHECK-NEXT: store %struct.libcS %.fca.5.insert, ptr addrspace(5) %1, align 8
+; CHECK-NEXT: store %struct.libcS %dotfca.5.insert, ptr addrspace(5) %1, align 8
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %2)
; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 36, ptr addrspace(5) %vararg_buffer)
@@ -312,15 +312,15 @@ define hidden void @libcS_i32(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i64
; CHECK-LABEL: define {{[^@]+}}@libcS_i32(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i64 %x.coerce3, float %x.coerce4, double %x.coerce5, i32 noundef %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %libcS_i32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: %.fca.0.insert = insertvalue %struct.libcS poison, i8 %x.coerce0, 0
-; CHECK-NEXT: %.fca.1.insert = insertvalue %struct.libcS %.fca.0.insert, i16 %x.coerce1, 1
-; CHECK-NEXT: %.fca.2.insert = insertvalue %struct.libcS %.fca.1.insert, i32 %x.coerce2, 2
-; CHECK-NEXT: %.fca.3.insert = insertvalue %struct.libcS %.fca.2.insert, i64 %x.coerce3, 3
-; CHECK-NEXT: %.fca.4.insert = insertvalue %struct.libcS %.fca.3.insert, float %x.coerce4, 4
-; CHECK-NEXT: %.fca.5.insert = insertvalue %struct.libcS %.fca.4.insert, double %x.coerce5, 5
+; CHECK-NEXT: %dotfca.0.insert = insertvalue %struct.libcS poison, i8 %x.coerce0, 0
+; CHECK-NEXT: %dotfca.1.insert = insertvalue %struct.libcS %dotfca.0.insert, i16 %x.coerce1, 1
+; CHECK-NEXT: %dotfca.2.insert = insertvalue %struct.libcS %dotfca.1.insert, i32 %x.coerce2, 2
+; CHECK-NEXT: %dotfca.3.insert = insertvalue %struct.libcS %dotfca.2.insert, i64 %x.coerce3, 3
+; CHECK-NEXT: %dotfca.4.insert = insertvalue %struct.libcS %dotfca.3.insert, float %x.coerce4, 4
+; CHECK-NEXT: %dotfca.5.insert = insertvalue %struct.libcS %dotfca.4.insert, double %x.coerce5, 5
; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
-; CHECK-NEXT: store %struct.libcS %.fca.5.insert, ptr addrspace(5) %0, align 8
+; CHECK-NEXT: store %struct.libcS %dotfca.5.insert, ptr addrspace(5) %0, align 8
; CHECK-NEXT: %1 = getelementptr inbounds nuw %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
; CHECK-NEXT: store i32 %y, ptr addrspace(5) %1, align 4
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
@@ -515,15 +515,15 @@ define hidden void @fptr_libcS(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i6
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %fptr_libcS.vararg, align 4, addrspace(5)
; CHECK-NEXT: %0 = load volatile ptr, ptr addrspacecast (ptr addrspace(1) @vararg_ptr to ptr), align 8
-; CHECK-NEXT: %.fca.0.insert = insertvalue %struct.libcS poison, i8 %x.coerce0, 0
-; CHECK-NEXT: %.fca.1.insert = insertvalue %struct.libcS %.fca.0.insert, i16 %x.coerce1, 1
-; CHECK-NEXT: %.fca.2.insert = insertvalue %struct.libcS %.fca.1.insert, i32 %x.coerce2, 2
-; CHECK-NEXT: %.fca.3.insert = insertvalue %struct.libcS %.fca.2.insert, i64 %x.coerce3, 3
-; CHECK-NEXT: %.fca.4.insert = insertvalue %struct.libcS %.fca.3.insert, float %x.coerce4, 4
-; CHECK-NEXT: %.fca.5.insert = insertvalue %struct.libcS %.fca.4.insert, double %x.coerce5, 5
+; CHECK-NEXT: %dotfca.0.insert = insertvalue %struct.libcS poison, i8 %x.coerce0, 0
+; CHECK-NEXT: %dotfca.1.insert = insertvalue %struct.libcS %dotfca.0.insert, i16 %x.coerce1, 1
+; CHECK-NEXT: %dotfca.2.insert = insertvalue %struct.libcS %dotfca.1.insert, i32 %x.coerce2, 2
+; CHECK-NEXT: %dotfca.3.insert = insertvalue %struct.libcS %dotfca.2.insert, i64 %x.coerce3, 3
+; CHECK-NEXT: %dotfca.4.insert = insertvalue %struct.libcS %dotfca.3.insert, float %x.coerce4, 4
+; CHECK-NEXT: %dotfca.5.insert = insertvalue %struct.libcS %dotfca.4.insert, double %x.coerce5, 5
; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %1 = getelementptr inbounds nuw %fptr_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
-; CHECK-NEXT: store %struct.libcS %.fca.5.insert, ptr addrspace(5) %1, align 8
+; CHECK-NEXT: store %struct.libcS %dotfca.5.insert, ptr addrspace(5) %1, align 8
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void %0(ptr %2)
; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %vararg_buffer)
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 14ddf7daad1c6..9baa7f064b90b 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -216,34 +216,19 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2
; VI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: extract_vector_elt_v3f16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: extract_vector_elt_v3f16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: extract_vector_elt_v3f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
+; GFX11-NEXT: s_endpgm
%p0 = extractelement <3 x half> %foo, i32 0
%p1 = extractelement <3 x half> %foo, i32 2
%out1 = getelementptr half, ptr addrspace(1) %out, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 365588eaec3ac..2baea2550641a 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -197,26 +197,26 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
-; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_and_b32 s0, s3, 0x7fff7fff
+; CI-NEXT: s_and_b32 s1, s2, 0x7fff7fff
+; CI-NEXT: v_mov_b32_e32 v2, s1
+; CI-NEXT: v_mov_b32_e32 v3, s0
+; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_fabs_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
-; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_and_b32 s0, s3, 0x7fff7fff
+; VI-NEXT: s_and_b32 s1, s2, 0x7fff7fff
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_fabs_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 6bcb086944c91..baf9b0abf7b0c 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -115,13 +115,13 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s3, 31
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff
+; VI-NEXT: s_and_b32 s1, s2, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
store <2 x float> %fabs, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 7e4b1259db3aa..84ecdd367c808 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -2160,99 +2160,101 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) {
; SI-LABEL: s_copysign_v3f16:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshr_b32 s6, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
+; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_lshr_b32 s0, s2, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s6
-; SI-NEXT: v_cvt_f32_f16_e32 v3, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s3
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s2
+; SI-NEXT: s_lshr_b32 s5, s8, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s5
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s9
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s8
; SI-NEXT: s_brev_b32 s0, -2
-; SI-NEXT: v_bfi_b32 v2, s0, v2, v3
+; SI-NEXT: v_bfi_b32 v0, s0, v0, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_bfi_b32 v2, s0, v2, v5
+; SI-NEXT: v_bfi_b32 v1, s0, v1, v4
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_bfi_b32 v1, s0, v1, v5
-; SI-NEXT: v_bfi_b32 v0, s0, v0, v4
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
-; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
-; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v2, v0
+; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_copysign_v3f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_movk_i32 s6, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: s_lshr_b32 s4, s4, 16
; VI-NEXT: s_lshr_b32 s2, s2, 16
-; VI-NEXT: s_lshr_b32 s0, s0, 16
; VI-NEXT: v_bfi_b32 v0, s6, v0, v1
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_bfi_b32 v1, s6, v1, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_add_u32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_add_u32 s2, s0, 4
; VI-NEXT: v_bfi_b32 v3, s6, v0, v1
-; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_short v[0:1], v3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_store_short v[0:1], v3
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_copysign_v3f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: s_lshr_b32 s5, s6, 16
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
-; GFX9-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3
-; GFX9-NEXT: global_store_short v0, v2, s[6:7] offset:4
-; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:4
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: s_copysign_v3f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
-; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4
; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s3
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s5
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3
@@ -2263,31 +2265,32 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v3
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_store_b16 v4, v0, s[4:5] offset:4
-; GFX11-TRUE16-NEXT: global_store_b32 v4, v1, s[4:5]
+; GFX11-TRUE16-NEXT: global_store_b16 v4, v0, s[0:1] offset:4
+; GFX11-TRUE16-NEXT: global_store_b32 v4, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: s_copysign_v3f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, 0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s6, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
-; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2
-; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, s3, v2
+; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: global_store_b16 v3, v2, s[4:5] offset:4
-; GFX11-FAKE16-NEXT: global_store_b32 v3, v0, s[4:5]
+; GFX11-FAKE16-NEXT: global_store_b16 v3, v2, s[0:1] offset:4
+; GFX11-FAKE16-NEXT: global_store_b32 v3, v0, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign)
store <3 x half> %out, ptr addrspace(1) %arg_out
@@ -2297,23 +2300,25 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half
define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) {
; SI-LABEL: s_copysign_v4f16:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshr_b32 s8, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; SI-NEXT: s_lshr_b32 s0, s2, 16
-; SI-NEXT: s_lshr_b32 s9, s1, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_lshr_b32 s0, s3, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s8
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s9
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s2
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s3
+; SI-NEXT: s_lshr_b32 s1, s2, 16
+; SI-NEXT: s_lshr_b32 s10, s9, 16
+; SI-NEXT: s_lshr_b32 s11, s8, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s11
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s10
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s9
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_bfi_b32 v1, s0, v1, v5
; SI-NEXT: v_bfi_b32 v0, s0, v0, v4
@@ -2332,83 +2337,83 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half
;
; VI-LABEL: s_copysign_v4f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_movk_i32 s6, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_lshr_b32 s3, s3, 16
-; VI-NEXT: s_lshr_b32 s1, s1, 16
-; VI-NEXT: v_bfi_b32 v0, s6, v0, v1
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_bfi_b32 v1, s6, v1, v2
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: s_lshr_b32 s0, s5, 16
+; VI-NEXT: s_lshr_b32 s1, s3, 16
+; VI-NEXT: v_bfi_b32 v2, s6, v2, v3
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_bfi_b32 v3, s6, v3, v4
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_lshr_b32 s0, s4, 16
; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: s_lshr_b32 s0, s0, 16
-; VI-NEXT: v_bfi_b32 v0, s6, v0, v2
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_bfi_b32 v2, s6, v2, v3
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_bfi_b32 v2, s6, v2, v4
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: v_bfi_b32 v4, s6, v4, v5
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_copysign_v4f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: s_lshr_b32 s5, s7, 16
; GFX9-NEXT: s_lshr_b32 s3, s3, 16
-; GFX9-NEXT: s_lshr_b32 s1, s1, 16
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: s_lshr_b32 s1, s2, 16
-; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NEXT: s_lshr_b32 s3, s6, 16
+; GFX9-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v3
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: s_copysign_v4f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s5, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s4, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s4
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v4, v5
@@ -2419,33 +2424,33 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v0
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v4
-; GFX11-TRUE16-NEXT: global_store_b64 v5, v[0:1], s[4:5]
+; GFX11-TRUE16-NEXT: global_store_b64 v5, v[0:1], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: s_copysign_v4f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s7, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s6, 16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2
-; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0
-; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, s6, v2
-; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s6
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s3, v0
+; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, s5, v2
+; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, s2, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v0
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v3, 16, v4
-; GFX11-FAKE16-NEXT: global_store_b64 v5, v[0:1], s[4:5]
+; GFX11-FAKE16-NEXT: global_store_b64 v5, v[0:1], s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
%out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign)
store <4 x half> %out, ptr addrspace(1) %arg_out
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index fab45c9dc3bc3..d83a75e8fa110 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -472,50 +472,52 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out,
define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) {
; SI-LABEL: s_test_copysign_v2f32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
-; SI-NEXT: s_brev_b32 s8, -2
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_bfi_b32 v1, s8, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: v_mov_b32_e32 v2, s2
-; SI-NEXT: v_bfi_b32 v0, s8, v0, v2
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_brev_b32 s0, -2
+; SI-NEXT: v_mov_b32_e32 v0, s3
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_bfi_b32 v1, s0, v0, v1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v2, s8
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_bfi_b32 v0, s0, v0, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_v2f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_brev_b32 s6, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_bfi_b32 v1, s6, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_bfi_b32 v0, s6, v2, v0
-; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_bfi_b32 v3, s6, v2, v3
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_bfi_b32 v2, s6, v2, v4
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
+; GFX11-NEXT: v_mov_b32_e32 v2, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v2
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
store <2 x float> %result, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 33910947e6fac..04df0bc525ddc 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -938,16 +938,18 @@ entry:
define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
; GFX6-FASTFMA-LABEL: s_fdiv_v2f32:
; GFX6-FASTFMA: ; %bb.0: ; %entry
-; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1
+; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1
; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s9
-; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1
+; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-FASTFMA-NEXT: s_mov_b32 s4, s0
+; GFX6-FASTFMA-NEXT: s_mov_b32 s5, s1
+; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[0:1], s9, s9, v1
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s11
-; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9
+; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s9
+; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3
@@ -956,13 +958,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v3, v4
; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v2, v4, v0
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s2
; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v3, v4
-; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2
+; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[0:1], s8, s8, v2
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s11, v1
-; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s10
-; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8
+; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s9, v1
+; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4
@@ -972,20 +974,21 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v3, v5, v0
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v4, v5
-; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s10, v2
-; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s8, v2
+; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-FASTFMA-NEXT: s_endpgm
;
; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32:
; GFX6-SLOWFMA: ; %bb.0: ; %entry
-; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0
-; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s3
-; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1
-; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s0
+; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s3
+; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[6:7], s5, s5, v0
+; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3
+; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s2
+; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1
; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v3, 1.0
@@ -995,14 +998,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2
; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4
+; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[6:7], s4, s4, v4
; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5
-; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s2
-; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0
-; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1
+; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2
+; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1
; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2
-; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s3, v0
+; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s5, v0
; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v2, v5, 1.0
; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, v0, v5, v5
@@ -1012,22 +1014,24 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3
; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v5
-; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s2, v4
-; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, v4
+; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-SLOWFMA-NEXT: s_endpgm
;
; GFX7-LABEL: s_fdiv_v2f32:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, s9
-; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_mov_b32 s4, s0
+; GFX7-NEXT: s_mov_b32 s5, s1
+; GFX7-NEXT: v_div_scale_f32 v2, s[0:1], s9, s9, v1
; GFX7-NEXT: v_rcp_f32_e32 v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, s11
-; GFX7-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9
+; GFX7-NEXT: v_mov_b32_e32 v0, s9
+; GFX7-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3
@@ -1036,13 +1040,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX7-NEXT: v_fma_f32 v4, v5, v3, v4
; GFX7-NEXT: v_fma_f32 v0, -v2, v4, v0
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_div_fmas_f32 v0, v0, v3, v4
-; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2
+; GFX7-NEXT: v_div_scale_f32 v3, s[0:1], s8, s8, v2
; GFX7-NEXT: v_rcp_f32_e32 v4, v3
-; GFX7-NEXT: v_div_fixup_f32 v1, v0, s11, v1
-; GFX7-NEXT: v_mov_b32_e32 v0, s10
-; GFX7-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8
+; GFX7-NEXT: v_div_fixup_f32 v1, v0, s9, v1
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v5, -v3, v4, 1.0
; GFX7-NEXT: v_fma_f32 v4, v5, v4, v4
@@ -1052,19 +1056,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX7-NEXT: v_fma_f32 v0, -v3, v5, v0
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX7-NEXT: v_div_fmas_f32 v0, v0, v4, v5
-; GFX7-NEXT: v_div_fixup_f32 v0, v0, s10, v2
-; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7-NEXT: v_div_fixup_f32 v0, v0, s8, v2
+; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_fdiv_v2f32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_div_scale_f32 v1, s[6:7], s5, s5, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_rcp_f32_e32 v3, v1
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX8-NEXT: v_fma_f32 v5, -v1, v3, 1.0
@@ -1074,13 +1079,12 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX8-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4
+; GFX8-NEXT: v_div_scale_f32 v2, s[6:7], s4, s4, v4
; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2
; GFX8-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-NEXT: v_div_fixup_f32 v1, v1, s3, v0
+; GFX8-NEXT: v_div_fixup_f32 v1, v1, s5, v0
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX8-NEXT: v_fma_f32 v0, -v2, v5, 1.0
; GFX8-NEXT: v_fma_f32 v0, v0, v5, v5
@@ -1090,19 +1094,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_div_fixup_f32 v0, v0, s2, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_div_fixup_f32 v0, v0, s4, v4
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_v2f32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s6, s3, s3, s1
-; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1
+; GFX10-NEXT: v_div_scale_f32 v0, s4, s7, s7, s3
+; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s7, s3
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0
@@ -1112,12 +1117,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX10-NEXT: s_denorm_mode 12
-; GFX10-NEXT: v_div_scale_f32 v2, s6, s2, s2, s0
+; GFX10-NEXT: v_div_scale_f32 v2, s4, s6, s6, s2
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
-; GFX10-NEXT: v_div_fixup_f32 v1, v0, s3, s1
-; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s2, s0
+; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s3
+; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s6, s2
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3
@@ -1128,19 +1132,18 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v3, v4
-; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, s0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT: v_div_fixup_f32 v0, v0, s6, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_v2f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s1
-; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1
+; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s3
+; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s5, s3
; GFX11-NEXT: v_rcp_f32_e32 v1, v0
; GFX11-NEXT: s_denorm_mode 15
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1151,11 +1154,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX11-NEXT: s_denorm_mode 12
-; GFX11-NEXT: v_div_scale_f32 v2, null, s2, s2, s0
+; GFX11-NEXT: v_div_scale_f32 v2, null, s4, s4, s2
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX11-NEXT: v_rcp_f32_e32 v3, v2
-; GFX11-NEXT: v_div_fixup_f32 v1, v0, s3, s1
-; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s2, s0
+; GFX11-NEXT: v_div_fixup_f32 v1, v0, s5, s3
+; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s4, s2
; GFX11-NEXT: s_denorm_mode 15
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0
@@ -1167,8 +1170,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_denorm_mode 12
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v3, v4
-; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, s0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; EG-LABEL: s_fdiv_v2f32:
@@ -1193,58 +1196,60 @@ entry:
define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
; GFX67-LABEL: s_fdiv_ulp25_v2f32:
; GFX67: ; %bb.0: ; %entry
-; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX67-NEXT: s_mov_b32 s7, 0xf000
; GFX67-NEXT: s_mov_b32 s6, -1
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
-; GFX67-NEXT: v_rcp_f32_e32 v0, s2
-; GFX67-NEXT: v_rcp_f32_e32 v1, s3
-; GFX67-NEXT: v_mul_f32_e32 v0, s0, v0
-; GFX67-NEXT: v_mul_f32_e32 v1, s1, v1
+; GFX67-NEXT: v_rcp_f32_e32 v0, s8
+; GFX67-NEXT: v_rcp_f32_e32 v1, s9
+; GFX67-NEXT: s_mov_b32 s4, s0
+; GFX67-NEXT: s_mov_b32 s5, s1
+; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX67-NEXT: v_mul_f32_e32 v1, s3, v1
; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX67-NEXT: s_endpgm
;
; GFX8-LABEL: s_fdiv_ulp25_v2f32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f32_e32 v0, s2
-; GFX8-NEXT: v_rcp_f32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_mul_f32_e32 v0, s0, v0
-; GFX8-NEXT: v_mul_f32_e32 v1, s1, v1
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_rcp_f32_e32 v2, s6
+; GFX8-NEXT: v_rcp_f32_e32 v3, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mul_f32_e32 v2, s2, v2
+; GFX8-NEXT: v_mul_f32_e32 v3, s3, v3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_ulp25_v2f32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f32_e32 v0, s2
-; GFX10-NEXT: v_rcp_f32_e32 v1, s3
-; GFX10-NEXT: v_mul_f32_e32 v0, s0, v0
-; GFX10-NEXT: v_mul_f32_e32 v1, s1, v1
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT: v_rcp_f32_e32 v0, s6
+; GFX10-NEXT: v_rcp_f32_e32 v1, s7
+; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, s3, v1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_ulp25_v2f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rcp_f32_e32 v0, s2
-; GFX11-NEXT: v_rcp_f32_e32 v1, s3
+; GFX11-NEXT: v_rcp_f32_e32 v0, s6
+; GFX11-NEXT: v_rcp_f32_e32 v1, s7
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; EG-LABEL: s_fdiv_ulp25_v2f32:
@@ -1269,58 +1274,60 @@ entry:
define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
; GFX67-LABEL: s_fdiv_v2f32_fast_math:
; GFX67: ; %bb.0: ; %entry
-; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX67-NEXT: s_mov_b32 s7, 0xf000
; GFX67-NEXT: s_mov_b32 s6, -1
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
-; GFX67-NEXT: v_rcp_f32_e32 v0, s3
-; GFX67-NEXT: v_rcp_f32_e32 v2, s2
-; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0
-; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2
+; GFX67-NEXT: v_rcp_f32_e32 v0, s9
+; GFX67-NEXT: v_rcp_f32_e32 v2, s8
+; GFX67-NEXT: s_mov_b32 s4, s0
+; GFX67-NEXT: s_mov_b32 s5, s1
+; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0
+; GFX67-NEXT: v_mul_f32_e32 v0, s2, v2
; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX67-NEXT: s_endpgm
;
; GFX8-LABEL: s_fdiv_v2f32_fast_math:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f32_e32 v0, s3
-; GFX8-NEXT: v_rcp_f32_e32 v2, s2
-; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_rcp_f32_e32 v2, s7
+; GFX8-NEXT: v_rcp_f32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mul_f32_e32 v3, s3, v2
+; GFX8-NEXT: v_mul_f32_e32 v2, s2, v4
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_v2f32_fast_math:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f32_e32 v0, s3
-; GFX10-NEXT: v_rcp_f32_e32 v2, s2
-; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2
-; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7]
+; GFX10-NEXT: v_rcp_f32_e32 v0, s7
+; GFX10-NEXT: v_rcp_f32_e32 v2, s6
+; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, s2, v2
+; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_v2f32_fast_math:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rcp_f32_e32 v0, s3
-; GFX11-NEXT: v_rcp_f32_e32 v2, s2
+; GFX11-NEXT: v_rcp_f32_e32 v0, s7
+; GFX11-NEXT: v_rcp_f32_e32 v2, s6
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-NEXT: v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; EG-LABEL: s_fdiv_v2f32_fast_math:
@@ -1345,58 +1352,60 @@ entry:
define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
; GFX67-LABEL: s_fdiv_v2f32_arcp_math:
; GFX67: ; %bb.0: ; %entry
-; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX67-NEXT: s_mov_b32 s7, 0xf000
; GFX67-NEXT: s_mov_b32 s6, -1
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
-; GFX67-NEXT: v_rcp_f32_e32 v0, s3
-; GFX67-NEXT: v_rcp_f32_e32 v2, s2
-; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0
-; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2
+; GFX67-NEXT: v_rcp_f32_e32 v0, s9
+; GFX67-NEXT: v_rcp_f32_e32 v2, s8
+; GFX67-NEXT: s_mov_b32 s4, s0
+; GFX67-NEXT: s_mov_b32 s5, s1
+; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0
+; GFX67-NEXT: v_mul_f32_e32 v0, s2, v2
; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX67-NEXT: s_endpgm
;
; GFX8-LABEL: s_fdiv_v2f32_arcp_math:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f32_e32 v0, s3
-; GFX8-NEXT: v_rcp_f32_e32 v2, s2
-; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_rcp_f32_e32 v2, s7
+; GFX8-NEXT: v_rcp_f32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mul_f32_e32 v3, s3, v2
+; GFX8-NEXT: v_mul_f32_e32 v2, s2, v4
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_v2f32_arcp_math:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f32_e32 v0, s3
-; GFX10-NEXT: v_rcp_f32_e32 v2, s2
-; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2
-; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7]
+; GFX10-NEXT: v_rcp_f32_e32 v0, s7
+; GFX10-NEXT: v_rcp_f32_e32 v2, s6
+; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, s2, v2
+; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_v2f32_arcp_math:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rcp_f32_e32 v0, s3
-; GFX11-NEXT: v_rcp_f32_e32 v2, s2
+; GFX11-NEXT: v_rcp_f32_e32 v0, s7
+; GFX11-NEXT: v_rcp_f32_e32 v2, s6
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-NEXT: v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; EG-LABEL: s_fdiv_v2f32_arcp_math:
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index fe5601594dca8..57498989378b9 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -123,11 +123,11 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_rndne_f32_e32 v1, s3
-; VI-NEXT: v_rndne_f32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rndne_f32_e32 v3, s3
+; VI-NEXT: v_rndne_f32_e32 v2, s2
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fnearbyint_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 9642b36ecb7e8..766921df3cdff 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -478,13 +478,13 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000
-; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000
-; CIVI-NEXT: v_mov_b32_e32 v3, s1
-; CIVI-NEXT: v_mov_b32_e32 v0, s2
-; CIVI-NEXT: v_mov_b32_e32 v1, s3
-; CIVI-NEXT: v_mov_b32_e32 v2, s0
-; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CIVI-NEXT: v_mov_b32_e32 v0, s0
+; CIVI-NEXT: v_mov_b32_e32 v1, s1
+; CIVI-NEXT: s_or_b32 s0, s3, 0x80008000
+; CIVI-NEXT: s_or_b32 s1, s2, 0x80008000
+; CIVI-NEXT: v_mov_b32_e32 v2, s1
+; CIVI-NEXT: v_mov_b32_e32 v3, s0
+; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CIVI-NEXT: s_endpgm
;
; GFX9-LABEL: fneg_fabs_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 17e509acfb6e6..54ce02740e239 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -215,13 +215,13 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s3, 31
-; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_or_b32 s0, s3, 0x80000000
+; VI-NEXT: s_or_b32 s1, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
%fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 43caa4c739fb3..718cc4afe22da 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -179,22 +179,22 @@ entry:
define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshl_v2i32:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v0, s1, v0, 1
-; SI-NEXT: s_not_b32 s3, s5
-; SI-NEXT: s_lshr_b32 s1, s1, 1
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: s_not_b32 s1, s4
-; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; SI-NEXT: s_lshr_b32 s0, s0, 1
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: s_not_b32 s1, s7
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: v_alignbit_b32 v0, s3, v0, 1
+; SI-NEXT: s_lshr_b32 s0, s3, 1
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_alignbit_b32 v1, s0, v0, v1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_not_b32 s1, s6
+; SI-NEXT: v_alignbit_b32 v0, s2, v0, 1
+; SI-NEXT: s_lshr_b32 s0, s2, 1
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -202,47 +202,43 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
;
; VI-LABEL: fshl_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: s_not_b32 s7, s7
-; VI-NEXT: s_lshr_b32 s3, s1, 1
-; VI-NEXT: v_alignbit_b32 v0, s1, v0, 1
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_not_b32 s1, s6
-; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; VI-NEXT: s_lshr_b32 s0, s0, 1
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: s_not_b32 s1, s7
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 s0, s3, 1
+; VI-NEXT: v_alignbit_b32 v2, s3, v2, 1
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_alignbit_b32 v3, s0, v2, v3
; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_not_b32 s1, s6
+; VI-NEXT: v_alignbit_b32 v2, s2, v2, 1
+; VI-NEXT: s_lshr_b32 s0, s2, 1
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_alignbit_b32 v2, s0, v2, v4
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v2i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: s_lshr_b32 s3, s1, 1
-; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, 1
-; GFX9-NEXT: s_not_b32 s1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s13
+; GFX9-NEXT: s_not_b32 s1, s15
+; GFX9-NEXT: s_lshr_b32 s0, s11, 1
+; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_not_b32 s1, s8
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1
-; GFX9-NEXT: s_lshr_b32 s0, s0, 1
+; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-NEXT: s_not_b32 s1, s14
+; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1
+; GFX9-NEXT: s_lshr_b32 s0, s10, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v2i32:
@@ -265,40 +261,34 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
;
; GFX10-LABEL: fshl_v2i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, s1, s3, 1
-; GFX10-NEXT: v_alignbit_b32 v3, s0, s2, 1
-; GFX10-NEXT: s_lshr_b32 s1, s1, 1
-; GFX10-NEXT: s_not_b32 s2, s7
-; GFX10-NEXT: s_lshr_b32 s0, s0, 1
-; GFX10-NEXT: s_not_b32 s3, s6
-; GFX10-NEXT: v_alignbit_b32 v1, s1, v0, s2
-; GFX10-NEXT: v_alignbit_b32 v0, s0, v3, s3
+; GFX10-NEXT: v_alignbit_b32 v0, s11, s13, 1
+; GFX10-NEXT: v_alignbit_b32 v3, s10, s12, 1
+; GFX10-NEXT: s_lshr_b32 s0, s11, 1
+; GFX10-NEXT: s_not_b32 s1, s15
+; GFX10-NEXT: s_lshr_b32 s2, s10, 1
+; GFX10-NEXT: s_not_b32 s3, s14
+; GFX10-NEXT: v_alignbit_b32 v1, s0, v0, s1
+; GFX10-NEXT: v_alignbit_b32 v0, s2, v3, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_v2i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v0, s1, s3, 1
-; GFX11-NEXT: v_alignbit_b32 v3, s0, s2, 1
-; GFX11-NEXT: s_lshr_b32 s1, s1, 1
-; GFX11-NEXT: s_not_b32 s2, s7
-; GFX11-NEXT: s_lshr_b32 s0, s0, 1
-; GFX11-NEXT: s_not_b32 s3, s6
-; GFX11-NEXT: v_alignbit_b32 v1, s1, v0, s2
-; GFX11-NEXT: v_alignbit_b32 v0, s0, v3, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: v_alignbit_b32 v0, s3, s5, 1
+; GFX11-NEXT: v_alignbit_b32 v3, s2, s4, 1
+; GFX11-NEXT: s_lshr_b32 s3, s3, 1
+; GFX11-NEXT: s_not_b32 s4, s7
+; GFX11-NEXT: s_lshr_b32 s2, s2, 1
+; GFX11-NEXT: s_not_b32 s5, s6
+; GFX11-NEXT: v_alignbit_b32 v1, s3, v0, s4
+; GFX11-NEXT: v_alignbit_b32 v0, s2, v3, s5
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
%0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -309,43 +299,45 @@ entry:
define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshl_v2i32_imm:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_mov_b32_e32 v2, s2
-; SI-NEXT: v_alignbit_b32 v1, s1, v0, 23
-; SI-NEXT: v_alignbit_b32 v0, s0, v2, 25
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: v_mov_b32_e32 v0, s9
+; SI-NEXT: v_alignbit_b32 v1, s3, v0, 23
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_v2i32_imm:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_alignbit_b32 v1, s1, v0, 23
-; VI-NEXT: v_alignbit_b32 v0, s0, v2, 25
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v3, s3, v2, 23
+; VI-NEXT: v_alignbit_b32 v2, s2, v4, 25
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v2i32_imm:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 23
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 25
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
+; GFX9-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, 23
+; GFX9-NEXT: v_alignbit_b32 v0, s2, v3, 25
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v2i32_imm:
@@ -365,25 +357,25 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX10-LABEL: fshl_v2i32_imm:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 23
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 25
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT: v_alignbit_b32 v1, s3, s7, 23
+; GFX10-NEXT: v_alignbit_b32 v0, s2, s6, 25
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_v2i32_imm:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 23
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 25
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: v_alignbit_b32 v1, s3, s5, 23
+; GFX11-NEXT: v_alignbit_b32 v0, s2, s4, 25
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
%0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 816c3fccbb237..706ea1ded9373 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -221,51 +221,47 @@ entry:
define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshr_v2i32:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_mov_b32_e32 v1, s9
-; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v2, s8
-; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshr_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_alignbit_b32 v1, s1, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_alignbit_b32 v0, s0, v2, v0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_alignbit_b32 v3, s3, v2, v3
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v2, s2, v4, v2
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshr_v2i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3
+; GFX9-NEXT: v_mov_b32_e32 v0, s13
+; GFX9-NEXT: v_mov_b32_e32 v1, s15
+; GFX9-NEXT: v_mov_b32_e32 v3, s12
+; GFX9-NEXT: v_mov_b32_e32 v4, s14
+; GFX9-NEXT: v_alignbit_b32 v1, s11, v0, v1
+; GFX9-NEXT: v_alignbit_b32 v0, s10, v3, v4
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX9-NEXT: s_endpgm
;
@@ -285,79 +281,64 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
;
; GFX10-LABEL: fshr_v2i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s7
-; GFX10-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, v0
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, s15
+; GFX10-NEXT: v_mov_b32_e32 v2, s14
+; GFX10-NEXT: v_alignbit_b32 v1, s11, s13, v0
+; GFX10-NEXT: v_alignbit_b32 v0, s10, s12, v2
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9]
; GFX10-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fshr_v2i32:
; GFX11-TRUE16: ; %bb.0: ; %entry
-; GFX11-TRUE16-NEXT: s_clause 0x2
-; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l
-; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h
-; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s3, s5, v0.l
+; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s2, s4, v0.h
+; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fshr_v2i32:
; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2
-; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s3, s5, v0
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s2, s4, v2
+; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
; GFX12-TRUE16-LABEL: fshr_v2i32:
; GFX12-TRUE16: ; %bb.0: ; %entry
-; GFX12-TRUE16-NEXT: s_clause 0x2
-; GFX12-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l
-; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h
-; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s3, s5, v0.l
+; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s2, s4, v0.h
+; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
;
; GFX12-FAKE16-LABEL: fshr_v2i32:
; GFX12-FAKE16: ; %bb.0: ; %entry
-; GFX12-FAKE16-NEXT: s_clause 0x2
-; GFX12-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2
-; GFX12-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5]
+; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s3, s5, v0
+; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s2, s4, v2
+; GFX12-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX12-FAKE16-NEXT: s_endpgm
entry:
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -368,43 +349,45 @@ entry:
define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshr_v2i32_imm:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_mov_b32_e32 v2, s2
-; SI-NEXT: v_alignbit_b32 v1, s1, v0, 9
-; SI-NEXT: v_alignbit_b32 v0, s0, v2, 7
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: v_mov_b32_e32 v0, s9
+; SI-NEXT: v_alignbit_b32 v1, s3, v0, 9
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshr_v2i32_imm:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_alignbit_b32 v1, s1, v0, 9
-; VI-NEXT: v_alignbit_b32 v0, s0, v2, 7
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v3, s3, v2, 9
+; VI-NEXT: v_alignbit_b32 v2, s2, v4, 7
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshr_v2i32_imm:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 9
-; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 7
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
+; GFX9-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, 9
+; GFX9-NEXT: v_alignbit_b32 v0, s2, v3, 7
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v2i32_imm:
@@ -424,37 +407,37 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX10-LABEL: fshr_v2i32_imm:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 9
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 7
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT: v_alignbit_b32 v1, s3, s7, 9
+; GFX10-NEXT: v_alignbit_b32 v0, s2, s6, 7
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshr_v2i32_imm:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 9
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 7
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: v_alignbit_b32 v1, s3, s5, 9
+; GFX11-NEXT: v_alignbit_b32 v0, s2, s4, 7
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fshr_v2i32_imm:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_alignbit_b32 v1, s1, s3, 9
-; GFX12-NEXT: v_alignbit_b32 v0, s0, s2, 7
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT: v_alignbit_b32 v1, s3, s5, 9
+; GFX12-NEXT: v_alignbit_b32 v0, s2, s4, 7
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
entry:
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index a2fca33af1046..e95437b9f27e2 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -82,16 +82,16 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_add_u32 s4, s0, 4
-; CIVI-NEXT: s_addc_u32 s5, s1, 0
-; CIVI-NEXT: v_mov_b32_e32 v2, s4
-; CIVI-NEXT: v_mov_b32_e32 v4, s3
; CIVI-NEXT: v_mov_b32_e32 v0, s0
-; CIVI-NEXT: v_mov_b32_e32 v3, s5
; CIVI-NEXT: v_mov_b32_e32 v1, s1
-; CIVI-NEXT: v_mov_b32_e32 v5, s2
-; CIVI-NEXT: flat_store_short v[2:3], v4
-; CIVI-NEXT: flat_store_dword v[0:1], v5
+; CIVI-NEXT: v_mov_b32_e32 v2, s2
+; CIVI-NEXT: s_add_u32 s0, s0, 4
+; CIVI-NEXT: flat_store_dword v[0:1], v2
+; CIVI-NEXT: s_addc_u32 s1, s1, 0
+; CIVI-NEXT: v_mov_b32_e32 v0, s0
+; CIVI-NEXT: v_mov_b32_e32 v1, s1
+; CIVI-NEXT: v_mov_b32_e32 v2, s3
+; CIVI-NEXT: flat_store_short v[0:1], v2
; CIVI-NEXT: s_endpgm
;
; GFX11-LABEL: load_v3f16_arg:
@@ -116,8 +116,8 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
-; CIVI-NEXT: v_mov_b32_e32 v2, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s1
+; CIVI-NEXT: v_mov_b32_e32 v2, s2
; CIVI-NEXT: v_mov_b32_e32 v3, s3
; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CIVI-NEXT: s_endpgm
@@ -125,9 +125,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
; GFX11-LABEL: load_v4f16_arg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
store <4 x half> %arg, ptr addrspace(1) %out
@@ -305,31 +305,18 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
}
define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 {
-; CI-LABEL: extload_v3f16_to_v3f32_arg:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s4, s2, 16
-; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v4, s1
-; CI-NEXT: v_mov_b32_e32 v3, s0
-; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: extload_v3f16_to_v3f32_arg:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s2, 16
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
-; VI-NEXT: s_endpgm
+; CIVI-LABEL: extload_v3f16_to_v3f32_arg:
+; CIVI: ; %bb.0:
+; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_waitcnt lgkmcnt(0)
+; CIVI-NEXT: s_lshr_b32 s4, s2, 16
+; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; CIVI-NEXT: v_cvt_f32_f16_e32 v1, s4
+; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; CIVI-NEXT: v_mov_b32_e32 v3, s0
+; CIVI-NEXT: v_mov_b32_e32 v4, s1
+; CIVI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
+; CIVI-NEXT: s_endpgm
;
; GFX11-LABEL: extload_v3f16_to_v3f32_arg:
; GFX11: ; %bb.0:
@@ -337,9 +324,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s4, s2, 16
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1]
; GFX11-NEXT: s_endpgm
%ext = fpext <3 x half> %arg to <3 x float>
@@ -352,14 +339,14 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s4, s3, 16
-; CI-NEXT: s_lshr_b32 s5, s2, 16
+; CI-NEXT: s_lshr_b32 s4, s2, 16
+; CI-NEXT: s_lshr_b32 s5, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
-; CI-NEXT: v_cvt_f32_f16_e32 v3, s4
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s5
+; CI-NEXT: v_cvt_f32_f16_e32 v3, s5
+; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
+; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
@@ -369,12 +356,12 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: s_lshr_b32 s5, s2, 16
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
; VI-NEXT: v_cvt_f32_f16_e32 v3, s4
; VI-NEXT: v_cvt_f32_f16_e32 v1, s5
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -385,10 +372,10 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s4, s3, 16
; GFX11-NEXT: s_lshr_b32 s5, s2, 16
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s4
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s5
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
%ext = fpext <4 x half> %arg to <4 x float>
@@ -645,55 +632,30 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
}
define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 {
-; CI-LABEL: extload_v4f16_to_v4f64_arg:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s4, s3, 16
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
-; CI-NEXT: v_cvt_f32_f16_e32 v2, s4
-; CI-NEXT: s_lshr_b32 s5, s2, 16
-; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v6, s5
-; CI-NEXT: s_add_u32 s2, s0, 16
-; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; CI-NEXT: s_addc_u32 s3, s1, 0
-; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
-; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; CI-NEXT: v_mov_b32_e32 v9, s3
-; CI-NEXT: v_mov_b32_e32 v8, s2
-; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
-; CI-NEXT: s_nop 0
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: extload_v4f16_to_v4f64_arg:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s3, 16
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s3
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s5
-; VI-NEXT: s_lshr_b32 s4, s2, 16
-; VI-NEXT: v_cvt_f32_f16_e32 v4, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v6, s4
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
-; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; VI-NEXT: v_mov_b32_e32 v9, s3
-; VI-NEXT: v_mov_b32_e32 v8, s2
-; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; VI-NEXT: s_endpgm
+; CIVI-LABEL: extload_v4f16_to_v4f64_arg:
+; CIVI: ; %bb.0:
+; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_waitcnt lgkmcnt(0)
+; CIVI-NEXT: s_lshr_b32 s5, s3, 16
+; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s3
+; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s5
+; CIVI-NEXT: s_lshr_b32 s4, s2, 16
+; CIVI-NEXT: v_cvt_f32_f16_e32 v4, s2
+; CIVI-NEXT: v_cvt_f32_f16_e32 v6, s4
+; CIVI-NEXT: s_add_u32 s2, s0, 16
+; CIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; CIVI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; CIVI-NEXT: s_addc_u32 s3, s1, 0
+; CIVI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; CIVI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; CIVI-NEXT: v_mov_b32_e32 v9, s3
+; CIVI-NEXT: v_mov_b32_e32 v8, s2
+; CIVI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; CIVI-NEXT: s_nop 0
+; CIVI-NEXT: v_mov_b32_e32 v0, s0
+; CIVI-NEXT: v_mov_b32_e32 v1, s1
+; CIVI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CIVI-NEXT: s_endpgm
;
; GFX11-LABEL: extload_v4f16_to_v4f64_arg:
; GFX11: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 6f0c850117208..3b09d19960263 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -289,20 +289,20 @@ entry:
define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) {
; GCN-LABEL: half4_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_load_dword s5, s[4:5], 0x34
; GCN-NEXT: s_mov_b32 s4, 0x3c003c00
-; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s6, s6, 4
-; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_lshl_b32 s0, s5, 4
; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
entry:
@@ -418,20 +418,20 @@ entry:
define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) {
; GCN-LABEL: short4_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_load_dword s5, s[4:5], 0x34
; GCN-NEXT: s_mov_b32 s4, 0x10001
-; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s6, s6, 4
-; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_lshl_b32 s0, s5, 4
; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
entry:
@@ -443,19 +443,19 @@ entry:
define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) {
; GCN-LABEL: byte8_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s6, s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s4, s6, 3
-; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4
-; GCN-NEXT: s_and_b32 s7, s5, 0x1010101
-; GCN-NEXT: s_and_b32 s6, s4, 0x1010101
-; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_lshl_b32 s0, s4, 3
; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_lshl_b64 s[0:1], 0xff, s0
+; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GCN-NEXT: s_and_b32 s1, s1, 0x1010101
+; GCN-NEXT: s_and_b32 s0, s0, 0x1010101
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 20d2b12a1ebfe..ce622343140ae 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1555,10 +1555,10 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
; SI-NEXT: s_lshl_b32 s0, s8, 4
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
-; SI-NEXT: s_and_b32 s9, s1, 0x50005
-; SI-NEXT: s_and_b32 s8, s0, 0x50005
-; SI-NEXT: s_andn2_b64 s[0:1], s[2:3], s[0:1]
-; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; SI-NEXT: s_and_b32 s1, s1, 0x50005
+; SI-NEXT: s_and_b32 s0, s0, 0x50005
+; SI-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; SI-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 9df995b5a7066..9f3959c39d1af 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -734,8 +734,8 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32>
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -797,8 +797,8 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -1000,16 +1000,16 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s0, 4
-; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v5, s2
-; VI-NEXT: flat_store_short v[2:3], v4
-; VI-NEXT: flat_store_dword v[0:1], v5
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_add_u32 s0, s0, 4
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v3i16_arg:
@@ -1328,8 +1328,8 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -2393,8 +2393,8 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index fdccacf372dfa..e5c4b9209521e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -339,53 +339,53 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; VI-SDAG-LABEL: s_exp_v2f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x3fb8a000
+; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, s3, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0
+; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4
-; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000
-; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1
-; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2
-; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6
-; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
-; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
-; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5
-; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42b17218
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v1, v5, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT: v_exp_f32_e32 v5, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v8, s0
+; VI-SDAG-NEXT: v_sub_f32_e32 v8, s2, v8
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v8
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3fb8a000, v8
+; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
+; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v8, v8, v9
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v8
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7
+; VI-SDAG-NEXT: v_ldexp_f32 v3, v5, v3
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, v4
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 0c2e6f82c9115..7f4f4c9d0fa89 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -341,53 +341,53 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-SDAG-LABEL: s_exp10_v2f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x40549000
+; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4
-; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, s3, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0
+; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4
-; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000
-; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1
-; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2
-; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6
-; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v6
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x40549000, v6
-; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5
-; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6
-; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
-; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5
-; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc23369f4
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x421a209b
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v1, v5, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT: v_exp_f32_e32 v5, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v8, s0
+; VI-SDAG-NEXT: v_sub_f32_e32 v8, s2, v8
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2
+; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v8
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x40549000, v8
+; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
+; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2
+; VI-SDAG-NEXT: v_add_f32_e32 v8, v8, v9
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7
+; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v8
+; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4
+; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7
+; VI-SDAG-NEXT: v_ldexp_f32 v3, v5, v3
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, v4
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp10_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index c34113a5dfab0..0327795810c64 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -225,26 +225,26 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-SDAG-LABEL: s_exp2_v2f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2fc0000
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
+; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
+; VI-SDAG-NEXT: v_add_f32_e32 v2, s2, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0
-; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; VI-SDAG-NEXT: s_cselect_b32 s3, 0xffffffc0, 0
-; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, s3
-; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
-; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
+; VI-SDAG-NEXT: v_ldexp_f32 v3, v4, s0
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0
+; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, s0
+; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index ff8b539fd5ebb..910af766ad553 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -404,51 +404,51 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; VI-SDAG-LABEL: s_log_v2f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x800000
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2
; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
-; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
-; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3
-; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_ldexp_f32 v0, s3, v0
+; VI-SDAG-NEXT: v_log_f32_e32 v5, v0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_and_b32_e32 v6, 0xfffff000, v5
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v6
+; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3f317000, v7
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v7
+; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
+; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
+; VI-SDAG-NEXT: v_log_f32_e32 v7, v3
+; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v4
+; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v7
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v7, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5
-; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5
; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
-; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
-; VI-SDAG-NEXT: v_log_f32_e32 v5, v1
-; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2
+; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_log_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 4f783589f148f..10e6b61a297ae 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -404,51 +404,51 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-SDAG-LABEL: s_log10_v2f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x800000
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2
; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4
-; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
-; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3
-; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4
-; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_ldexp_f32 v0, s3, v0
+; VI-SDAG-NEXT: v_log_f32_e32 v5, v0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_and_b32_e32 v6, 0xfffff000, v5
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6
+; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v6
+; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3e9a2000, v7
+; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v7
+; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
+; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6
+; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3
+; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7
+; VI-SDAG-NEXT: v_log_f32_e32 v7, v3
+; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v4
+; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v7
+; VI-SDAG-NEXT: v_sub_f32_e32 v5, v7, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5
-; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v4
+; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5
; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4
-; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5
-; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5
-; VI-SDAG-NEXT: v_log_f32_e32 v5, v1
-; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2
-; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5
-; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3
-; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
-; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s3
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2
+; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_log10_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index a98baa2fdb35c..8a17633ae003d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -283,16 +283,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3
; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; VI-SDAG-NEXT: v_ldexp_f32 v0, s2, v0
; VI-SDAG-NEXT: v_log_f32_e32 v3, v3
-; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
-; VI-SDAG-NEXT: v_log_f32_e32 v4, v1
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-SDAG-NEXT: v_log_f32_e32 v5, v0
+; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2
+; VI-SDAG-NEXT: v_sub_f32_e32 v2, v5, v4
+; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_log2_v2f32:
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index aaf81e2fa4000..b57fa3db183ed 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -855,99 +855,99 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
;
; CI-LABEL: s_test_imin_sle_v4i16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
-; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_ashr_i32 s6, s0, 16
-; CI-NEXT: s_ashr_i32 s7, s1, 16
-; CI-NEXT: s_sext_i32_i16 s0, s0
-; CI-NEXT: s_sext_i32_i16 s1, s1
-; CI-NEXT: s_ashr_i32 s8, s2, 16
-; CI-NEXT: s_ashr_i32 s9, s3, 16
-; CI-NEXT: s_sext_i32_i16 s2, s2
-; CI-NEXT: s_sext_i32_i16 s3, s3
-; CI-NEXT: s_min_i32 s7, s7, s9
-; CI-NEXT: s_min_i32 s1, s1, s3
-; CI-NEXT: s_min_i32 s3, s6, s8
-; CI-NEXT: s_min_i32 s0, s0, s2
-; CI-NEXT: s_lshl_b32 s7, s7, 16
-; CI-NEXT: s_and_b32 s1, s1, 0xffff
-; CI-NEXT: s_lshl_b32 s3, s3, 16
-; CI-NEXT: s_and_b32 s0, s0, 0xffff
-; CI-NEXT: s_or_b32 s1, s1, s7
-; CI-NEXT: s_or_b32 s0, s0, s3
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_mov_b32_e32 v3, s5
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: s_ashr_i32 s0, s2, 16
+; CI-NEXT: s_ashr_i32 s1, s3, 16
+; CI-NEXT: s_sext_i32_i16 s2, s2
+; CI-NEXT: s_sext_i32_i16 s3, s3
+; CI-NEXT: s_ashr_i32 s6, s4, 16
+; CI-NEXT: s_ashr_i32 s7, s5, 16
+; CI-NEXT: s_sext_i32_i16 s4, s4
+; CI-NEXT: s_sext_i32_i16 s5, s5
+; CI-NEXT: s_min_i32 s1, s1, s7
+; CI-NEXT: s_min_i32 s3, s3, s5
+; CI-NEXT: s_min_i32 s0, s0, s6
+; CI-NEXT: s_min_i32 s2, s2, s4
+; CI-NEXT: s_lshl_b32 s1, s1, 16
+; CI-NEXT: s_and_b32 s3, s3, 0xffff
+; CI-NEXT: s_lshl_b32 s0, s0, 16
+; CI-NEXT: s_and_b32 s2, s2, 0xffff
+; CI-NEXT: s_or_b32 s1, s3, s1
+; CI-NEXT: s_or_b32 s0, s2, s0
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
-; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s6, s1, 16
-; VI-NEXT: s_sext_i32_i16 s1, s1
-; VI-NEXT: s_ashr_i32 s8, s3, 16
-; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: s_ashr_i32 s7, s0, 16
-; VI-NEXT: s_sext_i32_i16 s0, s0
-; VI-NEXT: s_ashr_i32 s9, s2, 16
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_min_i32 s6, s6, s8
-; VI-NEXT: s_min_i32 s1, s1, s3
-; VI-NEXT: s_min_i32 s7, s7, s9
-; VI-NEXT: s_min_i32 s0, s0, s2
-; VI-NEXT: s_lshl_b32 s2, s6, 16
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_or_b32 s1, s1, s2
-; VI-NEXT: s_lshl_b32 s2, s7, 16
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: s_or_b32 s0, s0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_ashr_i32 s0, s3, 16
+; VI-NEXT: s_ashr_i32 s1, s2, 16
+; VI-NEXT: s_sext_i32_i16 s3, s3
+; VI-NEXT: s_sext_i32_i16 s2, s2
+; VI-NEXT: s_ashr_i32 s6, s5, 16
+; VI-NEXT: s_ashr_i32 s7, s4, 16
+; VI-NEXT: s_sext_i32_i16 s5, s5
+; VI-NEXT: s_sext_i32_i16 s4, s4
+; VI-NEXT: s_min_i32 s1, s1, s7
+; VI-NEXT: s_min_i32 s0, s0, s6
+; VI-NEXT: s_min_i32 s2, s2, s4
+; VI-NEXT: s_min_i32 s3, s3, s5
+; VI-NEXT: s_lshl_b32 s0, s0, 16
+; VI-NEXT: s_and_b32 s3, s3, 0xffff
+; VI-NEXT: s_lshl_b32 s1, s1, 16
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_or_b32 s0, s3, s0
+; VI-NEXT: s_or_b32 s1, s2, s1
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: v_pk_min_i16 v1, s1, v0
-; GFX9-NEXT: v_pk_min_i16 v0, s0, v3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_pk_min_i16 v1, s3, v0
+; GFX9-NEXT: v_pk_min_i16 v0, s2, v3
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_min_i16 v1, s1, s3
-; GFX10-NEXT: v_pk_min_i16 v0, s0, s2
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT: v_pk_min_i16 v1, s3, s5
+; GFX10-NEXT: v_pk_min_i16 v0, s2, s4
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_v4i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_min_i16 v1, s1, s3
-; GFX11-NEXT: v_pk_min_i16 v0, s0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: v_pk_min_i16 v1, s3, s5
+; GFX11-NEXT: v_pk_min_i16 v0, s2, s4
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%cmp = icmp sle <4 x i16> %a, %b
%val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
@@ -1303,71 +1303,71 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
;
; CI-LABEL: s_test_imin_slt_v2i32:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
-; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_min_i32 s1, s1, s3
-; CI-NEXT: s_min_i32 s0, s0, s2
-; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_mov_b32_e32 v3, s5
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: s_min_i32 s0, s3, s5
+; CI-NEXT: s_min_i32 s1, s2, s4
+; CI-NEXT: v_mov_b32_e32 v2, s1
+; CI-NEXT: v_mov_b32_e32 v3, s0
+; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_slt_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
-; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_min_i32 s1, s1, s3
-; VI-NEXT: s_min_i32 s0, s0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_min_i32 s0, s3, s5
+; VI-NEXT: s_min_i32 s1, s2, s4
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_slt_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_min_i32 s1, s1, s3
-; GFX9-NEXT: s_min_i32 s0, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: s_min_i32 s3, s3, s5
+; GFX9-NEXT: s_min_i32 s2, s2, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_slt_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_min_i32 s0, s0, s2
-; GFX10-NEXT: s_min_i32 s1, s1, s3
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT: s_min_i32 s2, s2, s4
+; GFX10-NEXT: s_min_i32 s3, s3, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_slt_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_min_i32 s0, s0, s2
-; GFX11-NEXT: s_min_i32 s1, s1, s3
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: s_min_i32 s2, s2, s4
+; GFX11-NEXT: s_min_i32 s3, s3, s5
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%cmp = icmp slt <2 x i32> %a, %b
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 28a995e74f7ab..5027f146e08fd 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -2069,7 +2069,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; GFX900-LABEL: fadd_fadd_fsub:
; GFX900: ; %bb.0: ; %bb
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX900-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, s3
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 017b37af4cdf2..f65fef5620a33 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -93,62 +93,64 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
;
; SI-LABEL: rotl_v2i32:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_sub_i32 s3, 32, s3
-; SI-NEXT: s_sub_i32 s2, 32, s2
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_sub_i32 s0, 32, s9
+; SI-NEXT: s_sub_i32 s1, 32, s8
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_alignbit_b32 v1, s3, s3, v0
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; GFX8-LABEL: rotl_v2i32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_i32 s2, 32, s2
-; GFX8-NEXT: s_sub_i32 s3, 32, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_sub_i32 s1, 32, s5
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_alignbit_b32 v3, s3, s3, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v2
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: rotl_v2i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sub_i32 s3, 32, s3
-; GFX10-NEXT: s_sub_i32 s2, 32, s2
-; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT: s_sub_i32 s4, 32, s7
+; GFX10-NEXT: s_sub_i32 s5, 32, s6
+; GFX10-NEXT: v_alignbit_b32 v1, s3, s3, s4
+; GFX10-NEXT: v_alignbit_b32 v0, s2, s2, s5
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: rotl_v2i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s3, 32, s3
-; GFX11-NEXT: s_sub_i32 s2, 32, s2
-; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: s_sub_i32 s4, 32, s7
+; GFX11-NEXT: s_sub_i32 s5, 32, s6
+; GFX11-NEXT: v_alignbit_b32 v1, s3, s3, s4
+; GFX11-NEXT: v_alignbit_b32 v0, s2, s2, s5
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
%0 = shl <2 x i32> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index db56589b799dd..6498abc117c7e 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -82,54 +82,56 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
;
; SI-LABEL: rotr_v2i32:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s9
+; SI-NEXT: v_alignbit_b32 v1, s3, s3, v0
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; GFX8-LABEL: rotr_v2i32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_alignbit_b32 v3, s3, s3, v2
+; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v4
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: rotr_v2i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX10-NEXT: v_alignbit_b32 v1, s3, s3, s7
+; GFX10-NEXT: v_alignbit_b32 v0, s2, s2, s6
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: rotr_v2i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: v_alignbit_b32 v1, s3, s3, s5
+; GFX11-NEXT: v_alignbit_b32 v0, s2, s2, s4
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
%tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 840f3554b9457..7617b16ec684b 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -331,39 +331,39 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s2, 16
-; VI-NEXT: s_lshr_b32 s5, s3, 16
-; VI-NEXT: s_sub_i32 s6, 0, s3
-; VI-NEXT: s_sub_i32 s7, 0, s2
-; VI-NEXT: s_sub_i32 s5, 0, s5
-; VI-NEXT: s_sub_i32 s4, 0, s4
-; VI-NEXT: s_ashr_i32 s8, s2, 16
-; VI-NEXT: s_ashr_i32 s9, s3, 16
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_lshr_b32 s1, s3, 16
+; VI-NEXT: s_sub_i32 s4, 0, s3
+; VI-NEXT: s_sub_i32 s5, 0, s2
+; VI-NEXT: s_sub_i32 s1, 0, s1
+; VI-NEXT: s_sub_i32 s0, 0, s0
+; VI-NEXT: s_ashr_i32 s6, s2, 16
+; VI-NEXT: s_ashr_i32 s7, s3, 16
; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: s_sext_i32_i16 s7, s7
-; VI-NEXT: s_sext_i32_i16 s6, s6
-; VI-NEXT: s_sext_i32_i16 s4, s4
; VI-NEXT: s_sext_i32_i16 s5, s5
-; VI-NEXT: s_max_i32 s3, s3, s6
-; VI-NEXT: s_max_i32 s2, s2, s7
-; VI-NEXT: s_max_i32 s5, s9, s5
-; VI-NEXT: s_max_i32 s4, s8, s4
+; VI-NEXT: s_sext_i32_i16 s4, s4
+; VI-NEXT: s_sext_i32_i16 s0, s0
+; VI-NEXT: s_sext_i32_i16 s1, s1
+; VI-NEXT: s_max_i32 s3, s3, s4
+; VI-NEXT: s_max_i32 s2, s2, s5
+; VI-NEXT: s_max_i32 s1, s7, s1
+; VI-NEXT: s_max_i32 s0, s6, s0
; VI-NEXT: s_add_i32 s2, s2, 2
; VI-NEXT: s_add_i32 s3, s3, 2
-; VI-NEXT: s_lshl_b32 s4, s4, 16
-; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: s_lshl_b32 s0, s0, 16
+; VI-NEXT: s_lshl_b32 s1, s1, 16
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_or_b32 s3, s5, s3
-; VI-NEXT: s_or_b32 s2, s4, s2
-; VI-NEXT: s_add_i32 s3, s3, 0x20000
-; VI-NEXT: s_add_i32 s2, s2, 0x20000
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_or_b32 s1, s1, s3
+; VI-NEXT: s_or_b32 s0, s0, s2
+; VI-NEXT: s_add_i32 s1, s1, 0x20000
+; VI-NEXT: s_add_i32 s0, s0, 0x20000
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_abs_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index a23d99bcf3b9f..bc210965d880e 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -156,90 +156,93 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
;
; GFX6-LABEL: test_udivrem_v2:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX6-NEXT: s_sub_i32 s6, 0, s2
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9
+; GFX6-NEXT: s_sub_i32 s4, 0, s8
+; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_lo_u32 v2, s6, v0
-; GFX6-NEXT: s_sub_i32 s6, 0, s3
-; GFX6-NEXT: v_mul_lo_u32 v3, s6, v1
-; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: v_mul_lo_u32 v2, s4, v0
+; GFX6-NEXT: s_sub_i32 s4, 0, s9
+; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1
+; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0
-; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1
-; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2
-; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0
-; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v1
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0
+; GFX6-NEXT: v_mul_hi_u32 v1, s3, v1
+; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8
+; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v1
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0
+; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1
+; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s9, v1
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s9, v1
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: test_udivrem_v2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX8-NEXT: s_sub_i32 s6, 0, s2
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s7
+; GFX8-NEXT: s_sub_i32 s0, 0, s6
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT: v_mul_lo_u32 v2, s6, v0
-; GFX8-NEXT: s_sub_i32 s6, 0, s3
-; GFX8-NEXT: v_mul_lo_u32 v3, s6, v1
+; GFX8-NEXT: v_mul_lo_u32 v2, s0, v0
+; GFX8-NEXT: s_sub_i32 s0, 0, s7
+; GFX8-NEXT: v_mul_lo_u32 v3, s0, v1
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
-; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
-; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1
-; GFX8-NEXT: v_mul_lo_u32 v0, v0, s2
-; GFX8-NEXT: v_mul_lo_u32 v1, v1, s3
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s1, v1
-; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s3, v1
-; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s3, v1
-; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mul_hi_u32 v2, s2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v3
+; GFX8-NEXT: v_mul_hi_u32 v3, s3, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v2, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mul_lo_u32 v3, v3, s7
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s6, v2
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s3, v3
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s7, v3
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s6, v2
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s7, v3
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
%result0 = udiv <2 x i32> %x, %y
store <2 x i32> %result0, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 55cbc14a46706..996819f18d5b7 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -225,8 +225,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
%cast = uitofp <2 x i32> %in to <2 x double>